diff --git a/clang/docs/HIPSupport.rst b/clang/docs/HIPSupport.rst index b4a671e3cfa3c..ec2af2a6f569d 100644 --- a/clang/docs/HIPSupport.rst +++ b/clang/docs/HIPSupport.rst @@ -164,6 +164,8 @@ Predefined Macros - Represents wavefront memory scope in HIP (value is 2). * - ``__HIP_MEMORY_SCOPE_WORKGROUP`` - Represents workgroup memory scope in HIP (value is 3). + * - ``__HIP_MEMORY_SCOPE_CLUSTER`` + - Represents cluster memory scope in HIP (value is 6). * - ``__HIP_MEMORY_SCOPE_AGENT`` - Represents agent memory scope in HIP (value is 4). * - ``__HIP_MEMORY_SCOPE_SYSTEM`` diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 6bb99c757cd19..bef6e9c14b182 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -4846,6 +4846,7 @@ currently supported: * ``__MEMORY_SCOPE_SYSTEM`` * ``__MEMORY_SCOPE_DEVICE`` * ``__MEMORY_SCOPE_WRKGRP`` +* ``__MEMORY_SCOPE_CLUSTR`` * ``__MEMORY_SCOPE_WVFRNT`` * ``__MEMORY_SCOPE_SINGLE`` diff --git a/clang/include/clang/Basic/SyncScope.h b/clang/include/clang/Basic/SyncScope.h index 5a8d2a7dd02e5..7776c3d83a77d 100644 --- a/clang/include/clang/Basic/SyncScope.h +++ b/clang/include/clang/Basic/SyncScope.h @@ -43,11 +43,13 @@ enum class SyncScope { SystemScope, DeviceScope, WorkgroupScope, + ClusterScope, WavefrontScope, SingleScope, HIPSingleThread, HIPWavefront, HIPWorkgroup, + HIPCluster, HIPAgent, HIPSystem, OpenCLWorkGroup, @@ -65,6 +67,8 @@ inline llvm::StringRef getAsString(SyncScope S) { return "device_scope"; case SyncScope::WorkgroupScope: return "workgroup_scope"; + case SyncScope::ClusterScope: + return "cluster_scope"; case SyncScope::WavefrontScope: return "wavefront_scope"; case SyncScope::SingleScope: @@ -75,6 +79,8 @@ inline llvm::StringRef getAsString(SyncScope S) { return "hip_wavefront"; case SyncScope::HIPWorkgroup: return "hip_workgroup"; + case SyncScope::HIPCluster: + return "hip_cluster"; case SyncScope::HIPAgent: return "hip_agent"; case SyncScope::HIPSystem: @@ -174,13 +180,18 @@ class AtomicScopeHIPModel : public AtomicScopeModel { /// The enum values match the pre-defined macros /// __HIP_MEMORY_SCOPE_*, which are used to define memory_scope_* /// enums in hip-c.h. + /// These may be present in pch files or bitcode so preserve existing values + /// when adding a new ID. 
  enum ID {
    SingleThread = 1,
    Wavefront = 2,
    Workgroup = 3,
    Agent = 4,
    System = 5,
-   Last = System
+   Cluster = 6,
+   End,
+   Last = End - 1,
+   Count = Last
  };

  AtomicScopeHIPModel() {}

@@ -193,10 +204,14 @@ class AtomicScopeHIPModel : public AtomicScopeModel {
      return SyncScope::HIPWavefront;
    case Workgroup:
      return SyncScope::HIPWorkgroup;
+   case Cluster:
+     return SyncScope::HIPCluster;
    case Agent:
      return SyncScope::HIPAgent;
    case System:
      return SyncScope::HIPSystem;
+   case End:
+     break;
    }
    llvm_unreachable("Invalid language sync scope value");
  }
@@ -207,11 +222,12 @@ class AtomicScopeHIPModel : public AtomicScopeModel {
  }

  ArrayRef<unsigned> getRuntimeValues() const override {
-   static_assert(Last == System, "Does not include all sync scopes");
    static const unsigned Scopes[] = {
        static_cast<unsigned>(SingleThread), static_cast<unsigned>(Wavefront),
-       static_cast<unsigned>(Workgroup), static_cast<unsigned>(Agent),
-       static_cast<unsigned>(System)};
+       static_cast<unsigned>(Workgroup), static_cast<unsigned>(Cluster),
+       static_cast<unsigned>(System), static_cast<unsigned>(Agent)};
+   static_assert(sizeof(Scopes) / sizeof(Scopes[0]) == Count,
+                 "Does not include all sync scopes");
    return llvm::ArrayRef(Scopes);
  }

@@ -223,14 +239,18 @@
 /// Defines the generic atomic scope model.
 class AtomicScopeGenericModel : public AtomicScopeModel {
 public:
-  /// The enum values match predefined built-in macros __ATOMIC_SCOPE_*.
+  /// The enum values match predefined built-in macros __MEMORY_SCOPE_*.
+  /// These may be present in pch files or bitcode so preserve existing values
+  /// when adding a new ID.
   enum ID {
     System = 0,
     Device = 1,
     Workgroup = 2,
     Wavefront = 3,
     Single = 4,
-    Last = Single
+    Cluster = 5,
+    Count,
+    Last = Count - 1
   };

   AtomicScopeGenericModel() = default;

@@ -243,10 +263,14 @@
       return SyncScope::SystemScope;
     case Workgroup:
       return SyncScope::WorkgroupScope;
+    case Cluster:
+      return SyncScope::ClusterScope;
     case Wavefront:
       return SyncScope::WavefrontScope;
     case Single:
       return SyncScope::SingleScope;
+    case Count:
+      break;
     }
     llvm_unreachable("Invalid language sync scope value");
   }
@@ -256,11 +280,12 @@
   }

   ArrayRef<unsigned> getRuntimeValues() const override {
-    static_assert(Last == Single, "Does not include all sync scopes");
     static const unsigned Scopes[] = {
-        static_cast<unsigned>(Device), static_cast<unsigned>(System),
-        static_cast<unsigned>(Workgroup), static_cast<unsigned>(Wavefront),
-        static_cast<unsigned>(Single)};
+        static_cast<unsigned>(System), static_cast<unsigned>(Device),
+        static_cast<unsigned>(Workgroup), static_cast<unsigned>(Cluster),
+        static_cast<unsigned>(Wavefront), static_cast<unsigned>(Single)};
+    static_assert(sizeof(Scopes) / sizeof(Scopes[0]) == Count,
+                  "Does not include all sync scopes");
     return llvm::ArrayRef(Scopes);
   }

diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 5049a0ab0a395..f49a5af2c9587 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -12,6 +12,7 @@
 #include "CGBuiltin.h"
 #include "CodeGenFunction.h"
+#include "clang/Basic/SyncScope.h"
 #include "clang/Basic/TargetBuiltins.h"
 #include "clang/Frontend/FrontendDiagnostic.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -313,33 +314,33 @@ void CodeGenFunction::ProcessOrderScopeAMDGCN(Value *Order, Value *Scope,
   }

   // Older builtins had an enum argument for the memory scope.
+  const char *SSN = nullptr;
   int scope = cast<llvm::ConstantInt>(Scope)->getZExtValue();
   switch (scope) {
-  case 0: // __MEMORY_SCOPE_SYSTEM
+  case AtomicScopeGenericModel::System: // __MEMORY_SCOPE_SYSTEM
     SSID = llvm::SyncScope::System;
     break;
-  case 1: // __MEMORY_SCOPE_DEVICE
-    if (getTarget().getTriple().isSPIRV())
-      SSID = getLLVMContext().getOrInsertSyncScopeID("device");
-    else
-      SSID = getLLVMContext().getOrInsertSyncScopeID("agent");
+  case AtomicScopeGenericModel::Device: // __MEMORY_SCOPE_DEVICE
+    SSN = getTarget().getTriple().isSPIRV() ? "device" : "agent";
     break;
-  case 2: // __MEMORY_SCOPE_WRKGRP
-    SSID = getLLVMContext().getOrInsertSyncScopeID("workgroup");
+  case AtomicScopeGenericModel::Workgroup: // __MEMORY_SCOPE_WRKGRP
+    SSN = "workgroup";
     break;
-  case 3: // __MEMORY_SCOPE_WVFRNT
-    if (getTarget().getTriple().isSPIRV())
-      SSID = getLLVMContext().getOrInsertSyncScopeID("subgroup");
-    else
-      SSID = getLLVMContext().getOrInsertSyncScopeID("wavefront");
+  case AtomicScopeGenericModel::Cluster: // __MEMORY_SCOPE_CLUSTR
+    SSN = getTarget().getTriple().isSPIRV() ? "workgroup" : "cluster";
+    break;
+  case AtomicScopeGenericModel::Wavefront: // __MEMORY_SCOPE_WVFRNT
+    SSN = getTarget().getTriple().isSPIRV() ? "subgroup" : "wavefront";
     break;
-  case 4: // __MEMORY_SCOPE_SINGLE
+  case AtomicScopeGenericModel::Single: // __MEMORY_SCOPE_SINGLE
     SSID = llvm::SyncScope::SingleThread;
     break;
   default:
     SSID = llvm::SyncScope::System;
     break;
   }
+  if (SSN)
+    SSID = getLLVMContext().getOrInsertSyncScopeID(SSN);
 }

 llvm::Value *CodeGenFunction::EmitScalarOrConstFoldImmArg(unsigned ICEArguments,
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index 16d5919d62cbb..0bc4b4b7025f2 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -508,6 +508,10 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
   case SyncScope::WavefrontScope:
     Name = "wavefront";
     break;
+  case SyncScope::HIPCluster:
+  case SyncScope::ClusterScope:
+    Name = "cluster";
+    break;
   case SyncScope::HIPWorkgroup:
   case SyncScope::OpenCLWorkGroup:
   case SyncScope::WorkgroupScope:
diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
index 3f6d4e0a9277a..80e096ecf5ae9 100644
--- a/clang/lib/CodeGen/Targets/SPIR.cpp
+++ b/clang/lib/CodeGen/Targets/SPIR.cpp
@@ -93,6 +93,8 @@ inline StringRef mapClangSyncScopeToLLVM(SyncScope Scope) {
   case SyncScope::OpenCLSubGroup:
   case SyncScope::WavefrontScope:
     return "subgroup";
+  case SyncScope::HIPCluster:
+  case SyncScope::ClusterScope:
   case SyncScope::HIPWorkgroup:
   case SyncScope::OpenCLWorkGroup:
   case SyncScope::WorkgroupScope:
diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp
index baad63179d89a..47f1d5a6b636c 100644
--- a/clang/lib/Frontend/InitPreprocessor.cpp
+++ b/clang/lib/Frontend/InitPreprocessor.cpp
@@ -585,6 +585,7 @@ static void InitializeStandardPredefinedMacros(const TargetInfo &TI,
     Builder.defineMacro("__HIP_MEMORY_SCOPE_WORKGROUP", "3");
     Builder.defineMacro("__HIP_MEMORY_SCOPE_AGENT", "4");
     Builder.defineMacro("__HIP_MEMORY_SCOPE_SYSTEM", "5");
+    Builder.defineMacro("__HIP_MEMORY_SCOPE_CLUSTER", "6");
     if (LangOpts.HIPStdPar) {
       Builder.defineMacro("__HIPSTDPAR__");
       if (LangOpts.HIPStdParInterposeAlloc) {
@@ -873,6 +874,7 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
   Builder.defineMacro("__MEMORY_SCOPE_WRKGRP", "2");
   Builder.defineMacro("__MEMORY_SCOPE_WVFRNT", "3");
Builder.defineMacro("__MEMORY_SCOPE_SINGLE", "4"); + Builder.defineMacro("__MEMORY_SCOPE_CLUSTR", "5"); // Define macros for the OpenCL memory scope. // The values should match AtomicScopeOpenCLModel::ID enum. diff --git a/clang/test/CodeGen/scoped-atomic-ops.c b/clang/test/CodeGen/scoped-atomic-ops.c index 545a6c90892c2..c39048120a457 100644 --- a/clang/test/CodeGen/scoped-atomic-ops.c +++ b/clang/test/CodeGen/scoped-atomic-ops.c @@ -1,113 +1,772 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \ -// RUN: -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s +// RUN: -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_DEF %s // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \ -// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s +// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_20 %s // RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \ // RUN: -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s -// AMDGCN-LABEL: define hidden i32 @fi1a( -// AMDGCN: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] monotonic, align 4 -// AMDGCN: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:.+]] syncscope("singlethread") monotonic, align 4 -// SPIRV: define hidden spir_func i32 @fi1a( -// SPIRV: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:.+]] monotonic, align 4 -// SPIRV: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:.+]] syncscope("device") monotonic, align 4 -// SPIRV: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:.+]] syncscope("singlethread") monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi1a( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[V_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP1]], ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("agent") monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// 
AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP5]], ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("cluster") monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("wavefront") monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP9]], ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP11]], ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = load i32, ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP12]] +// +// AMDGCN_CL_20-LABEL: define hidden i32 @fi1a( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP1]], ptr addrspace(5) [[V]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("agent") monotonic, align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr addrspace(5) [[V]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[V]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("cluster") monotonic, align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr addrspace(5) [[V]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("wavefront") monotonic, align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP9]], ptr addrspace(5) [[V]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 +// AMDGCN_CL_20-NEXT: store i32 [[TMP11]], ptr addrspace(5) [[V]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(5) [[V]], align 4 +// AMDGCN_CL_20-NEXT: ret i32 [[TMP12]] +// +// SPIRV-LABEL: define hidden spir_func i32 @fi1a( +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// 
SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[V:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP1]], ptr [[V]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP3:%.*]] = load atomic i32, ptr [[TMP2]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[V]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP5]], ptr [[V]], align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP7:%.*]] = load atomic i32, ptr [[TMP6]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP7]], ptr [[V]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP9]], ptr [[V]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP11:%.*]] = load atomic i32, ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP11]], ptr [[V]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = load i32, ptr [[V]], align 4 +// SPIRV-NEXT: ret i32 [[TMP12]] +// int fi1a(int *i) { int v; __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP); + __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT); __scoped_atomic_load(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE); return v; } // AMDGCN-LABEL: define hidden i32 @fi1b( -// AMDGCN: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:%.+]] monotonic, align 4 -// AMDGCN: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4 +// AMDGCN-SAME: ptr noundef [[I:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN-NEXT: 
[[ATOMIC_TEMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP1]] to ptr +// AMDGCN-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN-NEXT: [[ATOMIC_TEMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP3]] to ptr +// AMDGCN-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN-NEXT: [[ATOMIC_TEMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP5]] to ptr +// AMDGCN-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4 +// AMDGCN-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4 +// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("agent") monotonic, align 4 +// AMDGCN-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4 +// AMDGCN-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4 +// AMDGCN-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP11:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4 +// AMDGCN-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("cluster") monotonic, align 4 +// AMDGCN-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4 +// AMDGCN-NEXT: [[TMP16:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("wavefront") monotonic, align 4 +// AMDGCN-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP19:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN-NEXT: [[TMP20:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4 +// AMDGCN-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP23:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4 +// AMDGCN-NEXT: [[TMP24:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 +// AMDGCN-NEXT: ret i32 [[TMP25]] +// // SPIRV-LABEL: define hidden spir_func i32 
@fi1b( -// SPIRV: [[TMP0:%.*]] = load atomic i32, ptr [[PTR0:%.+]] monotonic, align 4 -// SPIRV: [[TMP1:%.*]] = load atomic i32, ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4 -// SPIRV: [[TMP2:%.*]] = load atomic i32, ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP3:%.*]] = load atomic i32, ptr [[PTR3:%.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP4:%.*]] = load atomic i32, ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load atomic i32, ptr [[TMP0]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP1]], ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP5:%.*]] = load atomic i32, ptr [[TMP4]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP5]], ptr [[ATOMIC_TEMP1]], align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[ATOMIC_TEMP1]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP6]], ptr [[TMP7]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP9:%.*]] = load atomic i32, ptr [[TMP8]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP9]], ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP10]], ptr [[TMP11]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP13:%.*]] = load atomic i32, ptr [[TMP12]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP13]], ptr [[ATOMIC_TEMP3]], align 4 +// SPIRV-NEXT: [[TMP14:%.*]] = load i32, ptr [[ATOMIC_TEMP3]], align 4 +// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP14]], ptr [[TMP15]], align 4 +// SPIRV-NEXT: [[TMP16:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP17:%.*]] = load atomic i32, ptr [[TMP16]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP21:%.*]] = load atomic i32, ptr [[TMP20]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP21]], ptr [[ATOMIC_TEMP5]], align 4 +// SPIRV-NEXT: [[TMP22:%.*]] = load i32, ptr [[ATOMIC_TEMP5]], align 4 +// SPIRV-NEXT: [[TMP23:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// 
SPIRV-NEXT: store i32 [[TMP22]], ptr [[TMP23]], align 4 +// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 +// SPIRV-NEXT: ret i32 [[TMP25]] +// int fi1b(int *i) { *i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); *i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); *i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP); + *i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); *i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT); *i = __scoped_atomic_load_n(i, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE); return *i; } -// AMDGCN-LABEL: define hidden void @fi2a( -// AMDGCN: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4 -// AMDGCN: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden void @fi2a( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[V_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load i32, ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = load i32, ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load i32, ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[V_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi2a( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: 
[[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[V:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[V]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP2]], ptr [[TMP0]] monotonic, align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP5]], ptr [[TMP3]] syncscope("agent") monotonic, align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 +// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP8]], ptr [[TMP6]] syncscope("workgroup") monotonic, align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4 +// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP11]], ptr [[TMP9]] syncscope("cluster") monotonic, align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load i32, ptr [[TMP13]], align 4 +// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP14]], ptr [[TMP12]] syncscope("wavefront") monotonic, align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(5) [[V]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4 +// AMDGCN_CL_20-NEXT: store atomic i32 [[TMP17]], ptr [[TMP15]] syncscope("singlethread") monotonic, align 4 +// AMDGCN_CL_20-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi2a( -// SPIRV: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4 -// SPIRV: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4 -// SPIRV: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[V:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[V]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[V]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP3:%.*]] = 
load i32, ptr [[V]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP5:%.*]] = load i32, ptr [[V]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP7:%.*]] = load i32, ptr [[V]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP9:%.*]] = load i32, ptr [[V]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[V]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: ret void +// void fi2a(int *i) { int v = 1; __scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); __scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); __scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP); + __scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); __scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT); __scoped_atomic_store(i, &v, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE); } // AMDGCN-LABEL: define hidden void @fi2b( -// AMDGCN: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4 -// AMDGCN: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4 +// AMDGCN-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTATOMICTMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN-NEXT: [[DOTATOMICTMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP2]] to ptr +// AMDGCN-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN-NEXT: [[DOTATOMICTMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP4]] to ptr +// AMDGCN-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], 
align 4 +// AMDGCN-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4 +// AMDGCN-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("agent") monotonic, align 4 +// AMDGCN-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP2_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2_ASCAST]], align 4 +// AMDGCN-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4 +// AMDGCN-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("cluster") monotonic, align 4 +// AMDGCN-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP4_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4_ASCAST]], align 4 +// AMDGCN-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("wavefront") monotonic, align 4 +// AMDGCN-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 +// AMDGCN-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi2b( -// SPIRV: store atomic i32 [[TMP0:%.+]], ptr [[PTR0:%.+]] monotonic, align 4 -// SPIRV: store atomic i32 [[TMP1:%.+]], ptr [[PTR1:%.+]] syncscope("device") monotonic, align 4 -// SPIRV: store atomic i32 [[TMP2:%.+]], ptr [[PTR2:%.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: store atomic i32 [[TMP3:%.+]], ptr [[PTR3:%.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: store atomic i32 [[TMP4:%.+]], ptr [[PTR4:%.+]] syncscope("singlethread") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP4:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP1]], ptr [[TMP0]] monotonic, align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP3]], ptr [[TMP2]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr 
[[DOTATOMICTMP2]], align 4 +// SPIRV-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTATOMICTMP2]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP5]], ptr [[TMP4]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP7]], ptr [[TMP6]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP4]], align 4 +// SPIRV-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTATOMICTMP4]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP9]], ptr [[TMP8]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: store atomic i32 [[TMP11]], ptr [[TMP10]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: ret void +// void fi2b(int *i) { __scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); __scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); __scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP); + __scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); __scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT); __scoped_atomic_store_n(i, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE); } -// AMDGCN-LABEL: define hidden void @fi3a( -// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4 -// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] monotonic, align 4 -// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] monotonic, align 4 -// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] monotonic, align 4 -// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] monotonic, align 4 -// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] monotonic, align 4 -// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] monotonic, align 4 -// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden void @fi3a( +// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: 
[[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3:![0-9]+]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr 
[[TMP19]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3a( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, 
addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: 
[[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4:![0-9]+]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// 
AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_20-NEXT: ret void 
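// NOTE: for __MEMORY_SCOPE_SYSTEM (used by fi3a below), the generated
// atomicrmw carries no explicit syncscope on either target: system scope is
// the default, so the AMDGCN checks above and the SPIRV checks that follow
// show plain "monotonic" orderings with no syncscope("...") clause.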
+// // SPIRV-LABEL: define hidden spir_func void @fi3a( -// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4 -// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] monotonic, align 4 -// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] monotonic, align 4 -// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] monotonic, align 4 -// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] monotonic, align 4 -// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] monotonic, align 4 -// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] monotonic, align 4 -// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] monotonic, align 4 +// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// 
SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP37]], ptr 
[[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// SPIRV-NEXT: ret void +// void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); @@ -119,24 +778,357 @@ void fi3a(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); } -// AMDGCN-LABEL: define hidden void @fi3b( -// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("agent") monotonic, align 4 -// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("agent") monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden void @fi3b( +// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = 
alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 
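// The DEVICE scope used by fi3b lowers to syncscope("agent") in the AMDGCN
// checks that follow, while the SPIRV checks use syncscope("device"). The new
// cluster scope added by this patch is exercised through the same builtins.
// A minimal sketch, assuming the hypothetical helper below; the call pattern
// mirrors fi3b, and whether AMDGCN lowers __MEMORY_SCOPE_CLUSTR to a
// cluster-level syncscope is an assumption not shown in this hunk:
//
// void fi3_cluster(int *a) {
//   *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED,
//                                  __MEMORY_SCOPE_CLUSTR);
// }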
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], 
align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3b( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// 
AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 
[[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_20-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3b( -// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("device") monotonic, align 4 -// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("device") monotonic, align 4 -// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("device") monotonic, align 4 -// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 
[[VAL3:.+]] syncscope("device") monotonic, align 4 -// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("device") monotonic, align 4 -// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("device") monotonic, align 4 -// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("device") monotonic, align 4 -// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("device") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = 
load i32, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP39:%.*]] = load 
ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// SPIRV-NEXT: ret void +// void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); @@ -148,24 +1140,357 @@ void fi3b(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); } -// AMDGCN-LABEL: define hidden void @fi3c( -// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("workgroup") monotonic, align 4 -// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("workgroup") monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden void @fi3c( +// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, 
addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// 
AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 
4 +// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3c( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, 
addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr 
[[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_20-NEXT: 
[[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_20-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3c( -// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("workgroup") 
monotonic, align 4 -// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("workgroup") monotonic, align 4 -// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("workgroup") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr 
[[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP39:%.*]] = 
load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// SPIRV-NEXT: ret void +// void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP); *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP); @@ -177,24 +1502,719 @@ void fi3c(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP); } -// AMDGCN-LABEL: define hidden void @fi3d( -// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("wavefront") monotonic, align 4 -// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("wavefront") monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden void @fi3_clustr( +// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, 
align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// 
AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// 
AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3_clustr( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, 
addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr 
[[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_20-NEXT: 
[[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_20-NEXT: ret void +// +// SPIRV-LABEL: define hidden spir_func void @fi3_clustr( +// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, 
align 8 +// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// 
SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// SPIRV-NEXT: ret void +// +void fi3_clustr(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { + *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); + *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); + *c = __scoped_atomic_fetch_and(c, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); + *d = __scoped_atomic_fetch_or(d, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); + *e = __scoped_atomic_fetch_xor(e, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); + *f = __scoped_atomic_fetch_nand(f, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); + *g = __scoped_atomic_fetch_min(g, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); + *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); +} + +// AMDGCN_CL_DEF-LABEL: define hidden void @fi3d( +// AMDGCN_CL_DEF-SAME: 
ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] 
syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, 
ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3d( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: 
[[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] 
syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// 
AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_20-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3d( -// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("subgroup") monotonic, align 4 -// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("subgroup") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: 
[[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] 
syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// SPIRV-NEXT: ret void +// void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT); *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT); @@ -206,24 +2226,357 @@ void fi3d(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT); } -// AMDGCN-LABEL: define hidden void @fi3e( -// AMDGCN: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4 -// AMDGCN: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("singlethread") monotonic, align 4 -// AMDGCN: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("singlethread") monotonic, align 4 -// AMDGCN: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("singlethread") monotonic, align 4 -// AMDGCN: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("singlethread") monotonic, align 4 -// AMDGCN: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("singlethread") monotonic, align 4 -// AMDGCN: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("singlethread") monotonic, align 4 -// AMDGCN: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("singlethread") monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden void 
@fi3e( +// AMDGCN_CL_DEF-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[F_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP12:%.*]] = atomicrmw and ptr 
[[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// 
AMDGCN_CL_DEF-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_DEF-NEXT: ret void +// +// AMDGCN_CL_20-LABEL: define hidden void @fi3e( +// AMDGCN_CL_20-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[E_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[E_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[F_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) 
[[F_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[G_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[G_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[H_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[H_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP1]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP2_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP2]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP3_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP3]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP4_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP4]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP5_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP5]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP6_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP6]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP7_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP7]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP8_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP8]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP9_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP9]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP10_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP10]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP11_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP11]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP12_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP12]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP13_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP13]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP14_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP14]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[A]], ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[B]], ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[E]], ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[F]], ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[G]], ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[H]], ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = 
atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP30:%.*]] = 
load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// AMDGCN_CL_20-NEXT: ret void +// // SPIRV-LABEL: define hidden spir_func void @fi3e( -// SPIRV: [[TMP0:%.*]] = atomicrmw add ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4 -// SPIRV: [[TMP1:%.*]] = atomicrmw sub ptr [[PTR1:%.+]], i32 [[VAL1:.+]] syncscope("singlethread") monotonic, align 4 -// SPIRV: [[TMP2:%.*]] = atomicrmw and ptr [[PTR2:%.+]], i32 [[VAL2:.+]] syncscope("singlethread") monotonic, align 4 -// SPIRV: [[TMP3:%.*]] = atomicrmw or ptr [[PTR3:%.+]], i32 [[VAL3:.+]] syncscope("singlethread") monotonic, align 4 -// SPIRV: [[TMP4:%.*]] = atomicrmw xor ptr [[PTR4:%.+]], i32 [[VAL4:.+]] syncscope("singlethread") monotonic, align 4 -// SPIRV: [[TMP5:%.*]] = atomicrmw nand ptr [[PTR5:%.+]], i32 [[VAL5:.+]] syncscope("singlethread") monotonic, align 4 -// SPIRV: [[TMP6:%.*]] = atomicrmw min ptr [[PTR6:%.+]], i32 [[VAL6:.+]] syncscope("singlethread") monotonic, align 4 -// SPIRV: [[TMP7:%.*]] = atomicrmw max ptr [[PTR7:%.+]], i32 [[VAL7:.+]] syncscope("singlethread") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[A:%.*]], ptr noundef [[B:%.*]], ptr noundef [[C:%.*]], ptr noundef [[D:%.*]], ptr noundef [[E:%.*]], ptr noundef [[F:%.*]], ptr noundef [[G:%.*]], ptr noundef [[H:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[A_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[B_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[E_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[F_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[G_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[H_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP1:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP2:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP3:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP4:%.*]] = alloca i32, align 4 +// 
SPIRV-NEXT: [[DOTATOMICTMP5:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP6:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP7:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP8:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP9:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP10:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP11:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP12:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP13:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[ATOMIC_TEMP14:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[A]], ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[B]], ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[E]], ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[F]], ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[G]], ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[H]], ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw add ptr [[TMP0]], i32 [[TMP1]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP2]], ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = load i32, ptr [[ATOMIC_TEMP]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load ptr, ptr [[A_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[TMP4]], align 4 +// SPIRV-NEXT: [[TMP5:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTATOMICTMP1]], align 4 +// SPIRV-NEXT: [[TMP7:%.*]] = atomicrmw sub ptr [[TMP5]], i32 [[TMP6]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP7]], ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP8:%.*]] = load i32, ptr [[ATOMIC_TEMP2]], align 4 +// SPIRV-NEXT: [[TMP9:%.*]] = load ptr, ptr [[B_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP8]], ptr [[TMP9]], align 4 +// SPIRV-NEXT: [[TMP10:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTATOMICTMP3]], align 4 +// SPIRV-NEXT: [[TMP12:%.*]] = atomicrmw and ptr [[TMP10]], i32 [[TMP11]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP12]], ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP13:%.*]] = load i32, ptr [[ATOMIC_TEMP4]], align 4 +// SPIRV-NEXT: [[TMP14:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP13]], ptr [[TMP14]], align 4 +// SPIRV-NEXT: [[TMP15:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTATOMICTMP5]], align 4 +// SPIRV-NEXT: [[TMP17:%.*]] = atomicrmw or ptr [[TMP15]], i32 [[TMP16]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP17]], ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP18:%.*]] = load i32, ptr [[ATOMIC_TEMP6]], align 4 +// SPIRV-NEXT: [[TMP19:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP18]], ptr [[TMP19]], align 4 +// SPIRV-NEXT: [[TMP20:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP7]], align 4 +// SPIRV-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTATOMICTMP7]], 
align 4 +// SPIRV-NEXT: [[TMP22:%.*]] = atomicrmw xor ptr [[TMP20]], i32 [[TMP21]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP22]], ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP23:%.*]] = load i32, ptr [[ATOMIC_TEMP8]], align 4 +// SPIRV-NEXT: [[TMP24:%.*]] = load ptr, ptr [[E_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP23]], ptr [[TMP24]], align 4 +// SPIRV-NEXT: [[TMP25:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP26:%.*]] = load i32, ptr [[DOTATOMICTMP9]], align 4 +// SPIRV-NEXT: [[TMP27:%.*]] = atomicrmw nand ptr [[TMP25]], i32 [[TMP26]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP27]], ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP28:%.*]] = load i32, ptr [[ATOMIC_TEMP10]], align 4 +// SPIRV-NEXT: [[TMP29:%.*]] = load ptr, ptr [[F_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP28]], ptr [[TMP29]], align 4 +// SPIRV-NEXT: [[TMP30:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP31:%.*]] = load i32, ptr [[DOTATOMICTMP11]], align 4 +// SPIRV-NEXT: [[TMP32:%.*]] = atomicrmw min ptr [[TMP30]], i32 [[TMP31]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP32]], ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP33:%.*]] = load i32, ptr [[ATOMIC_TEMP12]], align 4 +// SPIRV-NEXT: [[TMP34:%.*]] = load ptr, ptr [[G_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP33]], ptr [[TMP34]], align 4 +// SPIRV-NEXT: [[TMP35:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP36:%.*]] = load i32, ptr [[DOTATOMICTMP13]], align 4 +// SPIRV-NEXT: [[TMP37:%.*]] = atomicrmw max ptr [[TMP35]], i32 [[TMP36]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP37]], ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP38:%.*]] = load i32, ptr [[ATOMIC_TEMP14]], align 4 +// SPIRV-NEXT: [[TMP39:%.*]] = load ptr, ptr [[H_ADDR]], align 8 +// SPIRV-NEXT: store i32 [[TMP38]], ptr [[TMP39]], align 4 +// SPIRV-NEXT: ret void +// void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { *a = __scoped_atomic_fetch_add(a, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE); *b = __scoped_atomic_fetch_sub(b, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE); @@ -235,10 +2588,98 @@ void fi3e(int *a, int *b, int *c, int *d, int *e, int *f, int *g, int *h) { *h = __scoped_atomic_fetch_max(h, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi4a( -// AMDGCN-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4a( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4a( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] acquire acquire, align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: 
[[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4a( -// SPIRV-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi4a(int *i) { int cmp = 0; int desired = 1; @@ -247,10 +2688,98 @@ _Bool fi4a(int *i) { __MEMORY_SCOPE_SYSTEM); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi4b( -// AMDGCN-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("agent") acquire acquire, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4b( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4 +// 
AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4b( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("agent") acquire acquire, align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4b( -// 
SPIRV-DAG: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("device") acquire acquire, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("device") acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi4b(int *i) { int cmp = 0; int desired = 1; @@ -259,10 +2788,98 @@ _Bool fi4b(int *i) { __MEMORY_SCOPE_DEVICE); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi4c( -// AMDGCN: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4c( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4 +// AMDGCN_CL_DEF-NEXT: 
[[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4c( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("workgroup") acquire acquire, align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4c( -// SPIRV: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: 
[[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
 _Bool fi4c(int *i) {
   int cmp = 0;
   int desired = 1;
@@ -271,10 +2888,198 @@ _Bool fi4c(int *i) {
                                           __MEMORY_SCOPE_WRKGRP);
 }
 
-// AMDGCN-LABEL: define hidden zeroext i1 @fi4d(
-// AMDGCN: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4_clustr(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4_clustr(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("cluster") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1
+// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]]
+//
+// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4_clustr(
+// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// SPIRV-NEXT: [[ENTRY:.*:]]
+// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8
+// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4
+// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1
+// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4
+// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8
+// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4
+// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4
+// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4
+// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// SPIRV: [[CMPXCHG_STORE_EXPECTED]]:
+// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4
+// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// SPIRV: [[CMPXCHG_CONTINUE]]:
+// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1
+// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// SPIRV-NEXT: ret i1 [[LOADEDV]]
+//
+_Bool fi4_clustr(int *i) {
+  int cmp = 0;
+  int desired = 1;
+  return __scoped_atomic_compare_exchange(i, &cmp, &desired, 0,
+                                          __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE,
+                                          __MEMORY_SCOPE_CLUSTR);
+}
+
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4d(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align
1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("wavefront") acquire acquire, align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4d( -// SPIRV: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("subgroup") acquire acquire, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("subgroup") acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: 
store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi4d(int *i) { int cmp = 0; int desired = 1; @@ -283,10 +3088,98 @@ _Bool fi4d(int *i) { __MEMORY_SCOPE_WVFRNT); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi4e( -// AMDGCN: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi4e( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DESIRED_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DESIRED_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi4e( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DESIRED:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr 
addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: store i32 1, ptr addrspace(5) [[DESIRED]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = addrspacecast ptr addrspace(5) [[DESIRED]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP2]], i32 [[TMP3]] syncscope("singlethread") acquire acquire, align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +// AMDGCN_CL_20-NEXT: br i1 [[TMP6]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_20-NEXT: store i32 [[TMP5]], ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP6]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP7:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP7]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi4e( -// SPIRV: [[TMP0:%.*]] = cmpxchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DESIRED:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: store i32 1, ptr [[DESIRED]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DESIRED]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi4e(int *i) { int cmp = 0; int desired = 1; @@ -295,10 +3188,98 @@ _Bool fi4e(int *i) { __MEMORY_SCOPE_SINGLE); } -// AMDGCN-LABEL: define 
hidden zeroext i1 @fi5a( -// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5a( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5a( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], 
ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5a( -// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] acquire acquire, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi5a(int *i) { int cmp = 0; return __scoped_atomic_compare_exchange_n(i, &cmp, 1, 1, __ATOMIC_ACQUIRE, @@ -306,10 +3287,98 @@ _Bool fi5a(int *i) { __MEMORY_SCOPE_SYSTEM); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi5b( -// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("agent") acquire acquire, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5b( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: 
[[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5b( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, 
ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("agent") acquire acquire, align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5b( -// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("device") acquire acquire, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("device") acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi5b(int *i) { int cmp = 0; return __scoped_atomic_compare_exchange_n(i, &cmp, 1, 1, __ATOMIC_ACQUIRE, @@ -317,127 +3386,1161 @@ _Bool fi5b(int *i) { __MEMORY_SCOPE_DEVICE); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi5c( -// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5c( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: 
[[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5c( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] 
syncscope("workgroup") acquire acquire, align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5c( -// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("workgroup") acquire acquire, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi5c(int *i) { int cmp = 0; return __scoped_atomic_compare_exchange_n( i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_WRKGRP); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi5d( -// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("wavefront") acquire acquire, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5_clustr( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// 
AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5_clustr( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("cluster") acquire acquire, align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label 
%[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// +// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5_clustr( +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("workgroup") acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// +_Bool fi5_clustr(int *i) { + int cmp = 0; + return __scoped_atomic_compare_exchange_n( + i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_CLUSTR); +} + +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5d( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// 
+// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5d(
+// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4
+// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1
+// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]]
+//
+// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5d(
+// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] {
+// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]]
+// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5)
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr
+// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr
+// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr
+// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr
+// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8
+// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4
+// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("wavefront") acquire acquire, align 4
+// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0
+// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1
+// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]]
+// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]:
+// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4
+// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]]
+// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]:
+// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8
+// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1
+// AMDGCN_CL_20-NEXT:
[[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5d( -// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("subgroup") acquire acquire, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("subgroup") acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi5d(int *i) { int cmp = 0; return __scoped_atomic_compare_exchange_n( i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_WVFRNT); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi5e( -// AMDGCN: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] syncscope("singlethread") acquire acquire, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi5e( +// AMDGCN_CL_DEF-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 0, ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP_ASCAST]], align 4 +// 
AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_DEF-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_DEF-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_DEF: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP4]], ptr [[CMP_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_DEF: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_DEF-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_DEF-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi5e( +// AMDGCN_CL_20-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[CMPXCHG_BOOL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[CMPXCHG_BOOL]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[I]], ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 0, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i32 1, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP_ASCAST]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// AMDGCN_CL_20-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// AMDGCN_CL_20: [[CMPXCHG_STORE_EXPECTED]]: +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr addrspace(5) [[CMP]], align 4 +// AMDGCN_CL_20-NEXT: br label %[[CMPXCHG_CONTINUE]] +// AMDGCN_CL_20: [[CMPXCHG_CONTINUE]]: +// AMDGCN_CL_20-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// AMDGCN_CL_20-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi5e( -// SPIRV: [[TMP0:%.*]] = cmpxchg weak ptr [[PTR0:%.+]], i32 [[VAL0:.+]], i32 [[VAL1:.+]] 
syncscope("singlethread") acquire acquire, align 4 +// SPIRV-SAME: ptr noundef [[I:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[I_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[CMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: [[CMPXCHG_BOOL:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[I]], ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 0, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[I_ADDR]], align 8 +// SPIRV-NEXT: store i32 1, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP1:%.*]] = load i32, ptr [[CMP]], align 4 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTATOMICTMP]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = cmpxchg weak ptr [[TMP0]], i32 [[TMP1]], i32 [[TMP2]] syncscope("singlethread") acquire acquire, align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = extractvalue { i32, i1 } [[TMP3]], 0 +// SPIRV-NEXT: [[TMP5:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +// SPIRV-NEXT: br i1 [[TMP5]], label %[[CMPXCHG_CONTINUE:.*]], label %[[CMPXCHG_STORE_EXPECTED:.*]] +// SPIRV: [[CMPXCHG_STORE_EXPECTED]]: +// SPIRV-NEXT: store i32 [[TMP4]], ptr [[CMP]], align 4 +// SPIRV-NEXT: br label %[[CMPXCHG_CONTINUE]] +// SPIRV: [[CMPXCHG_CONTINUE]]: +// SPIRV-NEXT: [[STOREDV:%.*]] = zext i1 [[TMP5]] to i8 +// SPIRV-NEXT: store i8 [[STOREDV]], ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[TMP6:%.*]] = load i8, ptr [[CMPXCHG_BOOL]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP6]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi5e(int *i) { int cmp = 0; return __scoped_atomic_compare_exchange_n( i, &cmp, 1, 1, __ATOMIC_ACQUIRE, __ATOMIC_ACQUIRE, __MEMORY_SCOPE_SINGLE); } -// AMDGCN-LABEL: define hidden i32 @fi6a( -// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6a( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] +// +// AMDGCN_CL_20-LABEL: define hidden i32 @fi6a( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef 
[[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 +// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6a( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] monotonic, align 4 +// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// SPIRV-NEXT: ret i32 [[TMP4]] +// int fi6a(int *c, int *d) { int ret; __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); return ret; } -// AMDGCN-LABEL: define hidden i32 @fi6b( -// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("agent") monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6b( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] 
to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] +// +// AMDGCN_CL_20-LABEL: define hidden i32 @fi6b( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("agent") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 +// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6b( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("device") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("device") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// SPIRV-NEXT: ret i32 [[TMP4]] +// int fi6b(int *c, int *d) { int ret; __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); return ret; } -// AMDGCN-LABEL: define hidden i32 @fi6c( -// AMDGCN: 
[[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6c( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] +// +// AMDGCN_CL_20-LABEL: define hidden i32 @fi6c( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("workgroup") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 +// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6c( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("workgroup") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) 
#[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// SPIRV-NEXT: ret i32 [[TMP4]] +// int fi6c(int *c, int *d) { int ret; __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP); return ret; } -// AMDGCN-LABEL: define hidden i32 @fi6d( -// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("wavefront") monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6_clustr( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] +// +// AMDGCN_CL_20-LABEL: define hidden i32 @fi6_clustr( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr 
[[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("cluster") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 +// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] +// +// SPIRV-LABEL: define hidden spir_func i32 @fi6_clustr( +// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("workgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// SPIRV-NEXT: ret i32 [[TMP4]] +// +int fi6_clustr(int *c, int *d) { + int ret; + __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR); + return ret; +} + +// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6d( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] +// +// AMDGCN_CL_20-LABEL: define hidden i32 @fi6d( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: 
[[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("wavefront") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 +// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6d( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("subgroup") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("subgroup") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// SPIRV-NEXT: ret i32 [[TMP4]] +// int fi6d(int *c, int *d) { int ret; __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT); return ret; } -// AMDGCN-LABEL: define hidden i32 @fi6e( -// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4 +// AMDGCN_CL_DEF-LABEL: define hidden i32 @fi6e( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[RET_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i32 [[TMP3]], ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET_ASCAST]], align 4 +// AMDGCN_CL_DEF-NEXT: ret i32 [[TMP4]] +// +// AMDGCN_CL_20-LABEL: define hidden i32 @fi6e( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RET:%.*]] = alloca i32, align 4, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[D_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[D_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store ptr [[D]], ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = addrspacecast ptr addrspace(5) [[RET]] to ptr +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP4:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP3]] syncscope("singlethread") monotonic, align 4, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i32 [[TMP4]], ptr [[TMP2]], align 4 +// AMDGCN_CL_20-NEXT: [[TMP5:%.*]] = load i32, ptr addrspace(5) [[RET]], align 4 +// AMDGCN_CL_20-NEXT: ret i32 [[TMP5]] +// // SPIRV-LABEL: define hidden spir_func i32 @fi6e( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i32 [[VAL0:.+]] syncscope("singlethread") monotonic, align 4 +// SPIRV-SAME: ptr noundef [[C:%.*]], ptr noundef [[D:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[D_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[RET:%.*]] = alloca i32, align 4 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store ptr [[D]], ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP1:%.*]] = load ptr, ptr [[D_ADDR]], align 8 +// SPIRV-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// SPIRV-NEXT: [[TMP3:%.*]] = atomicrmw xchg ptr [[TMP0]], i32 [[TMP2]] syncscope("singlethread") monotonic, align 4 +// SPIRV-NEXT: store i32 [[TMP3]], ptr [[RET]], align 4 +// SPIRV-NEXT: [[TMP4:%.*]] = load i32, ptr [[RET]], align 4 +// SPIRV-NEXT: ret i32 [[TMP4]] +// int fi6e(int *c, int *d) { int ret; __scoped_atomic_exchange(c, d, &ret, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE); return 
ret; } -// AMDGCN-LABEL: define hidden zeroext i1 @fi7a( -// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] monotonic, align 1 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7a( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7a( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7a( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] monotonic, align 1 
+// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] monotonic, align 1 +// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi7a(_Bool *c) { return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SYSTEM); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi7b( -// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("agent") monotonic, align 1 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7b( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7b( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("agent") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7b( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("device") monotonic, align 1 +// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("device") monotonic, align 1 +// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi7b(_Bool *c) { return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi7c( -// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("workgroup") monotonic, align 1 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7c( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr 
[[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7c( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7c( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("workgroup") monotonic, align 1 +// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("workgroup") monotonic, align 1 +// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi7c(_Bool *c) { return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi7d( -// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("wavefront") monotonic, align 1 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7_clustr( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// 
AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7_clustr( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("cluster") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// +// SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7_clustr( +// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] 
syncscope("workgroup") monotonic, align 1 +// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// +_Bool fi7_clustr(_Bool *c) { + return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED, + __MEMORY_SCOPE_CLUSTR); +} + +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7d( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7d( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("wavefront") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr 
[[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7d( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("subgroup") monotonic, align 1 +// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("subgroup") monotonic, align 1 +// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi7d(_Bool *c) { return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT); } -// AMDGCN-LABEL: define hidden zeroext i1 @fi7e( -// AMDGCN: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("singlethread") monotonic, align 1 +// AMDGCN_CL_DEF-LABEL: define hidden zeroext i1 @fi7e( +// AMDGCN_CL_DEF-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_DEF-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_DEF-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_DEF-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_DEF-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_DEF-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_DEF-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_DEF-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_DEF-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META3]], !amdgpu.no.remote.memory [[META3]] +// AMDGCN_CL_DEF-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_DEF-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_DEF-NEXT: ret i1 [[LOADEDV]] +// +// AMDGCN_CL_20-LABEL: define hidden zeroext i1 @fi7e( +// AMDGCN_CL_20-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// AMDGCN_CL_20-NEXT: [[ENTRY:.*:]] +// AMDGCN_CL_20-NEXT: [[RETVAL:%.*]] = alloca i1, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1, addrspace(5) +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1, addrspace(5) 
+// AMDGCN_CL_20-NEXT: [[RETVAL_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[RETVAL]] to ptr +// AMDGCN_CL_20-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// AMDGCN_CL_20-NEXT: [[DOTATOMICTMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DOTATOMICTMP]] to ptr +// AMDGCN_CL_20-NEXT: [[ATOMIC_TEMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ATOMIC_TEMP]] to ptr +// AMDGCN_CL_20-NEXT: store ptr [[C]], ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR_ASCAST]], align 8 +// AMDGCN_CL_20-NEXT: store i8 1, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1, !amdgpu.no.fine.grained.memory [[META4]], !amdgpu.no.remote.memory [[META4]] +// AMDGCN_CL_20-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP_ASCAST]], align 1 +// AMDGCN_CL_20-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// AMDGCN_CL_20-NEXT: ret i1 [[LOADEDV]] +// // SPIRV-LABEL: define hidden spir_func zeroext i1 @fi7e( -// SPIRV: [[TMP0:%.*]] = atomicrmw xchg ptr [[PTR0:%.+]], i8 [[VAL0:.+]] syncscope("singlethread") monotonic, align 1 +// SPIRV-SAME: ptr noundef [[C:%.*]]) #[[ATTR0]] { +// SPIRV-NEXT: [[ENTRY:.*:]] +// SPIRV-NEXT: [[C_ADDR:%.*]] = alloca ptr, align 8 +// SPIRV-NEXT: [[DOTATOMICTMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: [[ATOMIC_TEMP:%.*]] = alloca i8, align 1 +// SPIRV-NEXT: store ptr [[C]], ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: [[TMP0:%.*]] = load ptr, ptr [[C_ADDR]], align 8 +// SPIRV-NEXT: store i8 1, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP1:%.*]] = load i8, ptr [[DOTATOMICTMP]], align 1 +// SPIRV-NEXT: [[TMP2:%.*]] = atomicrmw xchg ptr [[TMP0]], i8 [[TMP1]] syncscope("singlethread") monotonic, align 1 +// SPIRV-NEXT: store i8 [[TMP2]], ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[TMP3:%.*]] = load i8, ptr [[ATOMIC_TEMP]], align 1 +// SPIRV-NEXT: [[LOADEDV:%.*]] = trunc i8 [[TMP3]] to i1 +// SPIRV-NEXT: ret i1 [[LOADEDV]] +// _Bool fi7e(_Bool *c) { return __scoped_atomic_exchange_n(c, 1, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE); } +//. +// AMDGCN_CL_DEF: [[META3]] = !{} +//. +// AMDGCN_CL_20: [[META4]] = !{} +//. 
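For context on the generated checks above: the new generic scope constant is target-dependent at lowering time. On amdgcn-amd-amdhsa it becomes syncscope("cluster"), while on spirv64, which has no cluster scope, it degrades to syncscope("workgroup"), as the fi7_clustr checks show. A minimal illustrative sketch of source-level usage with the generic scoped builtins follows; it is not part of the patch, the helper names are hypothetical, and the fetch-add is expected (not verified here) to lower the same way as the xchg checked above:

// Illustrative only; not part of this patch. Helper names are hypothetical.
// Cluster-scoped read-modify-write: expected to lower to
//   atomicrmw add ... syncscope("cluster")   on amdgcn-amd-amdhsa
//   atomicrmw add ... syncscope("workgroup") on spirv64
int cluster_fetch_add(int *p, int v) {
  return __scoped_atomic_fetch_add(p, v, __ATOMIC_RELAXED,
                                   __MEMORY_SCOPE_CLUSTR);
}

// Cluster-scoped release fence; compare the fe1b switch in the
// scoped-fence-ops.c diff below, where scope value 5 selects
//   fence syncscope("cluster") release   on amdgcn.
void cluster_release_fence(void) {
  __scoped_atomic_thread_fence(__ATOMIC_RELEASE, __MEMORY_SCOPE_CLUSTR);
}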
diff --git a/clang/test/CodeGen/scoped-fence-ops.c b/clang/test/CodeGen/scoped-fence-ops.c index d83ae05b0aea2..259e8d333e4c8 100644 --- a/clang/test/CodeGen/scoped-fence-ops.c +++ b/clang/test/CodeGen/scoped-fence-ops.c @@ -1,8 +1,8 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \ -// RUN: -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s +// RUN: -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_DEF %s // RUN: %clang_cc1 %s -emit-llvm -o - -triple=amdgcn-amd-amdhsa -ffreestanding \ -// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefix=AMDGCN %s +// RUN: -cl-std=CL2.0 -fvisibility=hidden | FileCheck --check-prefixes=AMDGCN,AMDGCN_CL_20 %s // RUN: %clang_cc1 %s -emit-llvm -o - -triple=spirv64-unknown-unknown -ffreestanding \ // RUN: -fvisibility=hidden | FileCheck --check-prefix=SPIRV %s // RUN: %clang_cc1 %s -emit-llvm -o - -triple=x86_64-unknown-linux-gnu -ffreestanding \ @@ -127,23 +127,27 @@ void fe1b(int ord) { // AMDGCN-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR_ASCAST]], align 4 // AMDGCN-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR_ASCAST]], align 4 // AMDGCN-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ -// AMDGCN-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] // AMDGCN-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]] +// AMDGCN-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] // AMDGCN-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]] +// AMDGCN-NEXT: i32 5, label %[[CLUSTER_SCOPE:.*]] // AMDGCN-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]] // AMDGCN-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]] // AMDGCN-NEXT: ] // AMDGCN: [[ATOMIC_SCOPE_CONTINUE]]: // AMDGCN-NEXT: ret void -// AMDGCN: [[DEVICE_SCOPE]]: -// AMDGCN-NEXT: fence syncscope("agent") release -// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // AMDGCN: [[SYSTEM_SCOPE]]: // AMDGCN-NEXT: fence release // AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[DEVICE_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("agent") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // AMDGCN: [[WORKGROUP_SCOPE]]: // AMDGCN-NEXT: fence syncscope("workgroup") release // AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// AMDGCN: [[CLUSTER_SCOPE]]: +// AMDGCN-NEXT: fence syncscope("cluster") release +// AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // AMDGCN: [[WAVEFRONT_SCOPE]]: // AMDGCN-NEXT: fence syncscope("wavefront") release // AMDGCN-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] @@ -158,23 +162,27 @@ void fe1b(int ord) { // SPIRV-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR]], align 4 // SPIRV-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR]], align 4 // SPIRV-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ -// SPIRV-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] // SPIRV-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]] +// SPIRV-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] // SPIRV-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]] +// SPIRV-NEXT: i32 5, label %[[CLUSTER_SCOPE:.*]] // SPIRV-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]] // SPIRV-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]] // SPIRV-NEXT: ] // SPIRV: [[ATOMIC_SCOPE_CONTINUE]]: // SPIRV-NEXT: ret void -// SPIRV: [[DEVICE_SCOPE]]: -// SPIRV-NEXT: fence syncscope("device") release -// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // SPIRV: [[SYSTEM_SCOPE]]: // SPIRV-NEXT: fence release // SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[DEVICE_SCOPE]]: +// SPIRV-NEXT: fence syncscope("device") release 
+// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // SPIRV: [[WORKGROUP_SCOPE]]: // SPIRV-NEXT: fence syncscope("workgroup") release // SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// SPIRV: [[CLUSTER_SCOPE]]: +// SPIRV-NEXT: fence syncscope("workgroup") release +// SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // SPIRV: [[WAVEFRONT_SCOPE]]: // SPIRV-NEXT: fence syncscope("subgroup") release // SPIRV-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] @@ -189,23 +197,27 @@ void fe1b(int ord) { // X86_64-NEXT: store i32 [[SCOPE]], ptr [[SCOPE_ADDR]], align 4 // X86_64-NEXT: [[TMP0:%.*]] = load i32, ptr [[SCOPE_ADDR]], align 4 // X86_64-NEXT: switch i32 [[TMP0]], label %[[ATOMIC_SCOPE_CONTINUE:.*]] [ -// X86_64-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] // X86_64-NEXT: i32 0, label %[[SYSTEM_SCOPE:.*]] +// X86_64-NEXT: i32 1, label %[[DEVICE_SCOPE:.*]] // X86_64-NEXT: i32 2, label %[[WORKGROUP_SCOPE:.*]] +// X86_64-NEXT: i32 5, label %[[CLUSTER_SCOPE:.*]] // X86_64-NEXT: i32 3, label %[[WAVEFRONT_SCOPE:.*]] // X86_64-NEXT: i32 4, label %[[SINGLE_SCOPE:.*]] // X86_64-NEXT: ] // X86_64: [[ATOMIC_SCOPE_CONTINUE]]: // X86_64-NEXT: ret void -// X86_64: [[DEVICE_SCOPE]]: +// X86_64: [[SYSTEM_SCOPE]]: // X86_64-NEXT: fence release // X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] -// X86_64: [[SYSTEM_SCOPE]]: +// X86_64: [[DEVICE_SCOPE]]: // X86_64-NEXT: fence release // X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // X86_64: [[WORKGROUP_SCOPE]]: // X86_64-NEXT: fence release // X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] +// X86_64: [[CLUSTER_SCOPE]]: +// X86_64-NEXT: fence release +// X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] // X86_64: [[WAVEFRONT_SCOPE]]: // X86_64-NEXT: fence release // X86_64-NEXT: br label %[[ATOMIC_SCOPE_CONTINUE]] @@ -257,3 +269,6 @@ void fe2a() { void fe2b() { __scoped_atomic_thread_fence(__ATOMIC_RELEASE, 999); } +//// NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +// AMDGCN_CL_20: {{.*}} +// AMDGCN_CL_DEF: {{.*}} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl index 6bb20bff436fb..faf6a7d44fee2 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-vi.cl @@ -5,6 +5,8 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1012 -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,GCN %s // RUN: %clang_cc1 -triple spirv64-amd-amdhsa -emit-llvm -o - %s | FileCheck --check-prefixes=CHECK,AMDGCNSPIRV %s +#define INVALID_MEMORY_SCOPE (__MEMORY_SCOPE_CLUSTR+1) + #pragma OPENCL EXTENSION cl_khr_fp16 : enable typedef unsigned long ulong; @@ -252,13 +254,19 @@ void test_update_dpp_const_int(global int* out, int arg1) // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} // CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} -// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}} -// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} -// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} + +// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("cluster") monotonic, align 4{{$}} +// AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}} -// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} -// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}} + +// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} +// CHECK: atomicrmw fadd ptr addrspace(3) %out, float %src monotonic, align 4{{$}} #if !defined(__SPIRV__) void test_ds_faddf(local float *out, float src) { #else @@ -279,9 +287,10 @@ void test_ds_faddf(local float *out, float src) { // Test all syncscopes. 
*out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false); *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false); + *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR, false); *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false); *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false); - *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, 5, false); // invalid + *out = __builtin_amdgcn_ds_faddf(out, src, __ATOMIC_RELAXED, INVALID_MEMORY_SCOPE, false); // invalid } // CHECK-LABEL: @test_ds_fmin @@ -295,13 +304,19 @@ void test_ds_faddf(local float *out, float src) { // CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} // CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} -// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}} -// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} -// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} + +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("cluster") monotonic, align 4{{$}} +// AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}} -// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} -// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}} + +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} +// CHECK: atomicrmw fmin ptr addrspace(3) %out, float %src monotonic, align 4{{$}} #if !defined(__SPIRV__) void test_ds_fminf(local float *out, float src) { @@ -322,9 +337,10 @@ void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) { // Test all syncscopes. 
*out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false); *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false); + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR, false); *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false); *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false); - *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, 5, false); // invalid + *out = __builtin_amdgcn_ds_fminf(out, src, __ATOMIC_RELAXED, INVALID_MEMORY_SCOPE, false); // invalid } // CHECK-LABEL: @test_ds_fmax @@ -338,13 +354,19 @@ void test_ds_fminf(__attribute__((address_space(3))) float *out, float src) { // CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} // CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src seq_cst, align 4{{$}} -// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} +// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("agent") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("device") monotonic, align 4{{$}} -// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} -// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} + +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("cluster") monotonic, align 4{{$}} +// AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("workgroup") monotonic, align 4{{$}} + +// GCN: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("wavefront") monotonic, align 4{{$}} // AMDGCNSPIRV: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("subgroup") monotonic, align 4{{$}} -// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} -// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}} + +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src syncscope("singlethread") monotonic, align 4{{$}} +// CHECK: atomicrmw fmax ptr addrspace(3) %out, float %src monotonic, align 4{{$}} #if !defined(__SPIRV__) void test_ds_fmaxf(local float *out, float src) { @@ -365,9 +387,10 @@ void test_ds_fmaxf(__attribute__((address_space(3))) float *out, float src) { // Test all syncscopes. 
*out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_DEVICE, false); *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WRKGRP, false); + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_CLUSTR, false); *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_WVFRNT, false); *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, __MEMORY_SCOPE_SINGLE, false); - *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, 5, false); // invalid + *out = __builtin_amdgcn_ds_fmaxf(out, src, __ATOMIC_RELAXED, INVALID_MEMORY_SCOPE, false); // invalid } // CHECK-LABEL: @test_s_memtime diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index 3036b496db25d..460778f39d003 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -234,6 +234,7 @@ // AARCH64-NEXT: #define __LONG_MAX__ 9223372036854775807L // AARCH64-NEXT: #define __LONG_WIDTH__ 64 // AARCH64-NEXT: #define __LP64__ 1 +// AARCH64-NEXT: #define __MEMORY_SCOPE_CLUSTR 5 // AARCH64-NEXT: #define __MEMORY_SCOPE_DEVICE 1 // AARCH64-NEXT: #define __MEMORY_SCOPE_SINGLE 4 // AARCH64-NEXT: #define __MEMORY_SCOPE_SYSTEM 0 @@ -989,6 +990,7 @@ // ARM64EC-MSVC: #define __LONG_LONG_MAX__ 9223372036854775807LL // ARM64EC-MSVC: #define __LONG_MAX__ 2147483647L // ARM64EC-MSVC: #define __LONG_WIDTH__ 32 +// ARM64EC-MSVC: #define __MEMORY_SCOPE_CLUSTR 5 // ARM64EC-MSVC: #define __MEMORY_SCOPE_DEVICE 1 // ARM64EC-MSVC: #define __MEMORY_SCOPE_SINGLE 4 // ARM64EC-MSVC: #define __MEMORY_SCOPE_SYSTEM 0 diff --git a/clang/test/Preprocessor/init-loongarch.c b/clang/test/Preprocessor/init-loongarch.c index 71a266b8a9157..fd7ce2073a512 100644 --- a/clang/test/Preprocessor/init-loongarch.c +++ b/clang/test/Preprocessor/init-loongarch.c @@ -182,11 +182,12 @@ // LA32: #define __LONG_LONG_MAX__ 9223372036854775807LL // LA32: #define __LONG_MAX__ 2147483647L // LA32: #define __LONG_WIDTH__ 32 -// LA32: #define __MEMORY_SCOPE_DEVICE 1 -// LA32: #define __MEMORY_SCOPE_SINGLE 4 -// LA32: #define __MEMORY_SCOPE_SYSTEM 0 -// LA32: #define __MEMORY_SCOPE_WRKGRP 2 -// LA32: #define __MEMORY_SCOPE_WVFRNT 3 +// LA32: #define __MEMORY_SCOPE_CLUSTR 5 +// LA32: #define __MEMORY_SCOPE_DEVICE 1 +// LA32: #define __MEMORY_SCOPE_SINGLE 4 +// LA32: #define __MEMORY_SCOPE_SYSTEM 0 +// LA32: #define __MEMORY_SCOPE_WRKGRP 2 +// LA32: #define __MEMORY_SCOPE_WVFRNT 3 // LA32: #define __NO_INLINE__ 1 // LA32: #define __NO_MATH_ERRNO__ 1 // LA32: #define __OBJC_BOOL_IS_BOOL 0 @@ -514,11 +515,12 @@ // LA64: #define __LONG_MAX__ 9223372036854775807L // LA64: #define __LONG_WIDTH__ 64 // LA64: #define __LP64__ 1 -// LA64: #define __MEMORY_SCOPE_DEVICE 1 -// LA64: #define __MEMORY_SCOPE_SINGLE 4 -// LA64: #define __MEMORY_SCOPE_SYSTEM 0 -// LA64: #define __MEMORY_SCOPE_WRKGRP 2 -// LA64: #define __MEMORY_SCOPE_WVFRNT 3 +// LA64: #define __MEMORY_SCOPE_CLUSTR 5 +// LA64: #define __MEMORY_SCOPE_DEVICE 1 +// LA64: #define __MEMORY_SCOPE_SINGLE 4 +// LA64: #define __MEMORY_SCOPE_SYSTEM 0 +// LA64: #define __MEMORY_SCOPE_WRKGRP 2 +// LA64: #define __MEMORY_SCOPE_WVFRNT 3 // LA64: #define __NO_INLINE__ 1 // LA64: #define __NO_MATH_ERRNO__ 1 // LA64: #define __OBJC_BOOL_IS_BOOL 0 diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index 7e0df96141364..4dea1b583a089 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -1889,6 +1889,7 @@ // WEBASSEMBLY64-NEXT:#define 
__LONG_MAX__ 9223372036854775807L // WEBASSEMBLY64-NEXT:#define __LONG_WIDTH__ 64 // WEBASSEMBLY64-NEXT:#define __LP64__ 1 +// WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_CLUSTR 5 // WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_DEVICE 1 // WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SINGLE 4 // WEBASSEMBLY-NEXT:#define __MEMORY_SCOPE_SYSTEM 0 @@ -2216,6 +2217,7 @@ // AVR:#define __LDBL_MIN__ 1.17549435e-38L // AVR:#define __LONG_LONG_MAX__ 9223372036854775807LL // AVR:#define __LONG_MAX__ 2147483647L +// AVR:#define __MEMORY_SCOPE_CLUSTR 5 // AVR:#define __MEMORY_SCOPE_DEVICE 1 // AVR:#define __MEMORY_SCOPE_SINGLE 4 // AVR:#define __MEMORY_SCOPE_SYSTEM 0 @@ -2521,6 +2523,7 @@ // RISCV32: #define __LITTLE_ENDIAN__ 1 // RISCV32: #define __LONG_LONG_MAX__ 9223372036854775807LL // RISCV32: #define __LONG_MAX__ 2147483647L +// RISCV32: #define __MEMORY_SCOPE_CLUSTR 5 // RISCV32: #define __MEMORY_SCOPE_DEVICE 1 // RISCV32: #define __MEMORY_SCOPE_SINGLE 4 // RISCV32: #define __MEMORY_SCOPE_SYSTEM 0 @@ -2745,6 +2748,7 @@ // RISCV64: #define __LONG_LONG_MAX__ 9223372036854775807LL // RISCV64: #define __LONG_MAX__ 9223372036854775807L // RISCV64: #define __LP64__ 1 +// RISCV64: #define __MEMORY_SCOPE_CLUSTR 5 // RISCV64: #define __MEMORY_SCOPE_DEVICE 1 // RISCV64: #define __MEMORY_SCOPE_SINGLE 4 // RISCV64: #define __MEMORY_SCOPE_SYSTEM 0 @@ -2937,11 +2941,11 @@ // XTENSA: #define __GXX_ABI_VERSION {{.*}} // XTENSA: #define __ILP32__ 1 // XTENSA: #define __INT16_C(c) c -// XTENSA: #define __INT16_C_SUFFIX__ +// XTENSA: #define __INT16_C_SUFFIX__ // XTENSA: #define __INT16_MAX__ 32767 // XTENSA: #define __INT16_TYPE__ short // XTENSA: #define __INT32_C(c) c -// XTENSA: #define __INT32_C_SUFFIX__ +// XTENSA: #define __INT32_C_SUFFIX__ // XTENSA: #define __INT32_MAX__ 2147483647 // XTENSA: #define __INT32_TYPE__ int // XTENSA: #define __INT64_C(c) c##LL @@ -2949,7 +2953,7 @@ // XTENSA: #define __INT64_MAX__ 9223372036854775807LL // XTENSA: #define __INT64_TYPE__ long long int // XTENSA: #define __INT8_C(c) c -// XTENSA: #define __INT8_C_SUFFIX__ +// XTENSA: #define __INT8_C_SUFFIX__ // XTENSA: #define __INT8_MAX__ 127 // XTENSA: #define __INT8_TYPE__ signed char // XTENSA: #define __INTMAX_C(c) c##LL @@ -3008,6 +3012,7 @@ // XTENSA: #define __LONG_LONG_MAX__ 9223372036854775807LL // XTENSA: #define __LONG_MAX__ 2147483647L // XTENSA: #define __LONG_WIDTH__ 32 +// XTENSA: #define __MEMORY_SCOPE_CLUSTR 5 // XTENSA: #define __MEMORY_SCOPE_DEVICE 1 // XTENSA: #define __MEMORY_SCOPE_SINGLE 4 // XTENSA: #define __MEMORY_SCOPE_SYSTEM 0 @@ -3050,7 +3055,7 @@ // XTENSA: #define __STDC_VERSION__ 201710L // XTENSA: #define __STDC__ 1 // XTENSA: #define __UINT16_C(c) c -// XTENSA: #define __UINT16_C_SUFFIX__ +// XTENSA: #define __UINT16_C_SUFFIX__ // XTENSA: #define __UINT16_MAX__ 65535 // XTENSA: #define __UINT16_TYPE__ unsigned short // XTENSA: #define __UINT32_C(c) c##U @@ -3062,7 +3067,7 @@ // XTENSA: #define __UINT64_MAX__ 18446744073709551615ULL // XTENSA: #define __UINT64_TYPE__ long long unsigned int // XTENSA: #define __UINT8_C(c) c -// XTENSA: #define __UINT8_C_SUFFIX__ +// XTENSA: #define __UINT8_C_SUFFIX__ // XTENSA: #define __UINT8_MAX__ 255 // XTENSA: #define __UINT8_TYPE__ unsigned char // XTENSA: #define __UINTMAX_C(c) c##ULL @@ -3089,7 +3094,7 @@ // XTENSA: #define __UINT_LEAST64_TYPE__ long long unsigned int // XTENSA: #define __UINT_LEAST8_MAX__ 255 // XTENSA: #define __UINT_LEAST8_TYPE__ unsigned char -// XTENSA: #define __USER_LABEL_PREFIX__ +// XTENSA: #define __USER_LABEL_PREFIX__ // XTENSA: #define 
__WCHAR_MAX__ 2147483647 // XTENSA: #define __WCHAR_TYPE__ int // XTENSA: #define __WCHAR_WIDTH__ 32 diff --git a/clang/test/SemaCUDA/atomic-ops.cu b/clang/test/SemaCUDA/atomic-ops.cu index 233ed1c10fc11..40e110c4b9b77 100644 --- a/clang/test/SemaCUDA/atomic-ops.cu +++ b/clang/test/SemaCUDA/atomic-ops.cu @@ -2,6 +2,8 @@ #include "Inputs/cuda.h" +#define INVALID_HIP_MEMORY_SCOPE (__HIP_MEMORY_SCOPE_CLUSTER+1) + __device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pll, unsigned long long *pull, float *fp, double *dbl) { int val = __hip_atomic_load(0); // expected-error {{too few arguments to function call, expected 3, have 1}} val = __hip_atomic_load(0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}} @@ -10,9 +12,10 @@ __device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pl val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); - val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}} + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}} val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_load(pi32, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_load(pi32, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -35,9 +38,10 @@ __device__ int test_hip_atomic_store(int *pi32, unsigned int *pu32, long long *p __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); - __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}} + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}} __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); __hip_atomic_store(pi32, 0, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); __hip_atomic_store(pi32, 0, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}} @@ -71,6 +75,7 @@ __device__ bool test_hip_atomic_cmpxchg_weak(int *ptr, int val, int desired) { flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_CONSUME, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + flag = 
__hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); diff --git a/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu index ea1f24670ff9a..503e786877819 100644 --- a/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu +++ b/clang/test/SemaCUDA/spirv-amdgcn-atomic-ops.cu @@ -2,6 +2,8 @@ #include "Inputs/cuda.h" +#define INVALID_HIP_MEMORY_SCOPE (__HIP_MEMORY_SCOPE_CLUSTER+1) + __device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pll, unsigned long long *pull, float *fp, double *dbl) { int val = __hip_atomic_load(0); // expected-error {{too few arguments to function call, expected 3, have 1}} val = __hip_atomic_load(0, 0, 0, 0); // expected-error {{too many arguments to function call, expected 3, have 4}} @@ -10,9 +12,10 @@ __device__ int test_hip_atomic_load(int *pi32, unsigned int *pu32, long long *pl val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); - val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}} + val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}} val = __hip_atomic_load(pi32, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_load(pi32, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_load(pi32, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -35,9 +38,10 @@ __device__ int test_hip_atomic_store(int *pi32, unsigned int *pu32, long long *p __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); - __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, 6); // expected-error {{synchronization scope argument to atomic operation is invalid}} + __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, INVALID_HIP_MEMORY_SCOPE); // expected-error {{synchronization scope argument to atomic operation is invalid}} __hip_atomic_store(pi32, 0, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); __hip_atomic_store(pi32, 0, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD); __hip_atomic_store(pi32, 0, __ATOMIC_CONSUME, __HIP_MEMORY_SCOPE_SINGLETHREAD); // expected-warning{{memory order argument to atomic operation is invalid}} @@ -71,6 +75,7 @@ __device__ bool 
test_hip_atomic_cmpxchg_weak(int *ptr, int val, int desired) { flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_CONSUME, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_CLUSTER); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_SEQ_CST, __HIP_MEMORY_SCOPE_SINGLETHREAD);
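A closing note on the HIP side of these Sema tests: per the table in HIPSupport.rst, __HIP_MEMORY_SCOPE_CLUSTER takes the value 6, which is exactly the literal these tests previously hard-coded as the invalid scope, so they now derive the first invalid value from the last valid one instead. A minimal illustrative sketch, not part of the patch (the function names are hypothetical; in a HIP/CUDA translation unit such as these tests, which include Inputs/cuda.h):

// HIP memory scope values after this change (from HIPSupport.rst):
//   SINGLETHREAD=1, WAVEFRONT=2, WORKGROUP=3, AGENT=4, SYSTEM=5, CLUSTER=6
// so the first invalid scope value is now 7, i.e. CLUSTER+1.
__device__ int load_cluster(int *p) {
  return __hip_atomic_load(p, __ATOMIC_ACQUIRE, __HIP_MEMORY_SCOPE_CLUSTER);
}
__device__ void store_cluster(int *p, int v) {
  __hip_atomic_store(p, v, __ATOMIC_RELEASE, __HIP_MEMORY_SCOPE_CLUSTER);
}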