diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 03cb02deb5e7f..6ad4e76c239d9 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -5259,6 +5259,8 @@ builtin function, and are named with a ``__opencl_`` prefix. The macros and ``__OPENCL_MEMORY_SCOPE_SUB_GROUP`` are provided, with values corresponding to the enumerators of OpenCL's ``memory_scope`` enumeration.) +.. _langext-__scoped_atomic: + __scoped_atomic builtins ------------------------ @@ -5754,6 +5756,32 @@ returns the bit at the position of the current lane. It is almost equivalent to ``(mask & (1 << lane_id)) != 0``, except that its behavior is only defined if the given mask has the same value for all active lanes of the current wave. + +__builtin_amdgcn_av_{load,store}_b128 +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Signature: + +.. code-block:: c + + typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u; + + v4u __builtin_amdgcn_av_load_b128(v4u *src, int scope); + + void __builtin_amdgcn_av_store_b128(v4u *dst, v4u data, int scope); + +Load or store a vector of 4 unsigned integers from or to memory with cache +behavior specified by ``scope``, which is one of the ``__MEMORY_SCOPE_*`` macros +defined for :ref:`scoped atomic builtins <langext-__scoped_atomic>`. + +The pointer argument must point to the global or generic address space. + +These builtins are supported on gfx9, gfx10, gfx11, and gfx12 targets. + +They map to the LLVM intrinsics ``llvm.amdgcn.av.load.b128`` and +``llvm.amdgcn.av.store.b128`` documented in `User Guide for AMDGPU Backend +<https://llvm.org/docs/AMDGPUUsage.html>`_.
+ ARM/AArch64 Language Extensions ------------------------------- diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td index b15a36df6c08f..a91ad0431d4a6 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.td +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td @@ -280,6 +280,13 @@ def __builtin_amdgcn_raw_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgp def __builtin_amdgcn_struct_ptr_buffer_load_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">; def __builtin_amdgcn_struct_ptr_buffer_load_async_lds : AMDGPUBuiltin<"void(__amdgpu_buffer_rsrc_t, void address_space<3> *, _Constant unsigned int, int, int, int, _Constant int, _Constant int)", [], "vmem-to-lds-load-insts">; +//===----------------------------------------------------------------------===// +// Global Available/Visible memory accesses. +//===----------------------------------------------------------------------===// + +def __builtin_amdgcn_av_load_b128: AMDGPUBuiltin<"_ExtVector<4, unsigned int>(_ExtVector<4, unsigned int> *, int)", [], "gfx9-insts">; +def __builtin_amdgcn_av_store_b128: AMDGPUBuiltin<"void(_ExtVector<4, unsigned int> *, _ExtVector<4, unsigned int>, int)", [], "gfx9-insts">; + //===----------------------------------------------------------------------===// // Async mark builtins. 
//===----------------------------------------------------------------------===// diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dbe6cb2c3a41c..76bacb7d49c8b 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -14230,7 +14230,7 @@ def note_amdgcn_unguarded_builtin_silence : Note<"enclose %0 in a __builtin_amdgcn_is_invocable check to silence " "this warning">; -def err_amdgcn_coop_atomic_invalid_as : Error<"cooperative atomic requires a global or generic pointer">; +def err_amdgcn_global_or_flat_pointer_required : Error<"builtin requires a global or generic pointer">; def err_amdgcn_dmask_has_too_many_bits_set : Error<"dmask argument cannot have more bits set than there are elements " diff --git a/clang/include/clang/Sema/SemaAMDGPU.h b/clang/include/clang/Sema/SemaAMDGPU.h index d520f3df544f4..a6205534e0de3 100644 --- a/clang/include/clang/Sema/SemaAMDGPU.h +++ b/clang/include/clang/Sema/SemaAMDGPU.h @@ -38,6 +38,7 @@ class SemaAMDGPU : public SemaBase { bool checkAtomicOrderingCABIArg(Expr *E, bool MayLoad, bool MayStore); bool checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore); + bool checkAVLoadStore(CallExpr *TheCall, bool IsStore); bool checkAtomicMonitorLoad(CallExpr *TheCall); bool checkMovDPPFunctionCall(CallExpr *TheCall, unsigned NumArgs, diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp index cb883e8780e59..fc6e397674cd1 100644 --- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp @@ -1012,6 +1012,22 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {Args[0]->getType()}); return Builder.CreateCall(F, {Args}); } + case AMDGPU::BI__builtin_amdgcn_av_load_b128: + case AMDGPU::BI__builtin_amdgcn_av_store_b128: { + const bool IsStore = BuiltinID == 
AMDGPU::BI__builtin_amdgcn_av_store_b128; + SmallVector<Value *> Args = {EmitScalarExpr(E->getArg(0))}; // addr + if (IsStore) + Args.push_back(EmitScalarExpr(E->getArg(1))); // data + const unsigned ScopeIdx = E->getNumArgs() - 1; + auto *ScopeExpr = + cast<llvm::ConstantInt>(EmitScalarExpr(E->getArg(ScopeIdx))); + Args.push_back(emitScopeMD(*this, ScopeExpr->getZExtValue())); + llvm::Function *F = + CGM.getIntrinsic(IsStore ? Intrinsic::amdgcn_av_store_b128 + : Intrinsic::amdgcn_av_load_b128, + {Args[0]->getType()}); + return Builder.CreateCall(F, Args); + } case AMDGPU::BI__builtin_amdgcn_get_fpenv: { Function *F = CGM.getIntrinsic(Intrinsic::get_fpenv, {llvm::Type::getInt64Ty(getLLVMContext())}); diff --git a/clang/lib/Sema/SemaAMDGPU.cpp b/clang/lib/Sema/SemaAMDGPU.cpp index 1d2b3898c92d6..f2dbe28b76b5c 100644 --- a/clang/lib/Sema/SemaAMDGPU.cpp +++ b/clang/lib/Sema/SemaAMDGPU.cpp @@ -149,6 +149,10 @@ bool SemaAMDGPU::CheckAMDGCNBuiltinFunctionCall(unsigned BuiltinID, case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_fp6: case AMDGPU::BI__builtin_amdgcn_cvt_scale_pk16_f32_bf6: return SemaRef.BuiltinConstantArgRange(TheCall, 2, 0, 15); + case AMDGPU::BI__builtin_amdgcn_av_load_b128: + return checkAVLoadStore(TheCall, /*IsStore=*/false); + case AMDGPU::BI__builtin_amdgcn_av_store_b128: + return checkAVLoadStore(TheCall, /*IsStore=*/true); case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_32x4B: case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_16x8B: case AMDGPU::BI__builtin_amdgcn_cooperative_atomic_load_8x16B: @@ -452,19 +456,43 @@ bool SemaAMDGPU::checkAtomicOrderingCABIArg(Expr *E, bool MayLoad, return false; } -bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) { - bool Fail = false; - - // First argument is a global or generic pointer. +// Check that the first argument to TheCall is a global or generic pointer.
+static bool checkGlobalOrFlatPointerArg(SemaAMDGPU &S, CallExpr *TheCall) { Expr *PtrArg = TheCall->getArg(0); QualType PtrTy = PtrArg->getType()->getPointeeType(); - unsigned AS = getASTContext().getTargetAddressSpace(PtrTy.getAddressSpace()); + unsigned AS = + S.getASTContext().getTargetAddressSpace(PtrTy.getAddressSpace()); if (AS != llvm::AMDGPUAS::FLAT_ADDRESS && - AS != llvm::AMDGPUAS::GLOBAL_ADDRESS) { - Fail = true; - Diag(TheCall->getBeginLoc(), diag::err_amdgcn_coop_atomic_invalid_as) - << PtrArg->getSourceRange(); - } + AS != llvm::AMDGPUAS::GLOBAL_ADDRESS) + return S.Diag(TheCall->getBeginLoc(), + diag::err_amdgcn_global_or_flat_pointer_required) + << PtrArg->getSourceRange(); + return false; +} + +static bool checkScopeAsInt(SemaAMDGPU &S, Expr *Scope) { + if (Scope->isValueDependent()) + return false; + auto ScopeModel = AtomicScopeModel::create(AtomicScopeModelKind::Generic); + if (std::optional<llvm::APSInt> Result = + Scope->getIntegerConstantExpr(S.SemaRef.Context)) + if (!ScopeModel->isValid(Result->getZExtValue())) + return S.Diag(Scope->getBeginLoc(), + diag::err_atomic_op_has_invalid_sync_scope) + << Scope->getSourceRange(); + return false; +} + +bool SemaAMDGPU::checkAVLoadStore(CallExpr *TheCall, bool IsStore) { + if (checkGlobalOrFlatPointerArg(*this, TheCall)) + return true; + + Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1); + return checkScopeAsInt(*this, Scope); +} + +bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) { + bool Fail = checkGlobalOrFlatPointerArg(*this, TheCall); Expr *AO = TheCall->getArg(IsStore ?
2 : 1); Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1); @@ -488,27 +516,15 @@ bool SemaAMDGPU::checkCoopAtomicFunctionCall(CallExpr *TheCall, bool IsStore) { } bool SemaAMDGPU::checkAtomicMonitorLoad(CallExpr *TheCall) { - bool Fail = false; - Expr *AO = TheCall->getArg(1); Expr *Scope = TheCall->getArg(TheCall->getNumArgs() - 1); if (AO->isValueDependent() || Scope->isValueDependent()) return false; - Fail |= checkAtomicOrderingCABIArg(TheCall->getArg(1), /*MayLoad=*/true, - /*MayStore=*/false); - - auto ScopeModel = AtomicScopeModel::create(AtomicScopeModelKind::Generic); - if (std::optional<llvm::APSInt> Result = - Scope->getIntegerConstantExpr(SemaRef.Context)) { - if (!ScopeModel->isValid(Result->getZExtValue())) { - Diag(Scope->getBeginLoc(), diag::err_atomic_op_has_invalid_sync_scope) - << Scope->getSourceRange(); - Fail = true; - } - } - + bool Fail = checkAtomicOrderingCABIArg(AO, /*MayLoad=*/true, + /*MayStore=*/false); + Fail |= checkScopeAsInt(*this, Scope); return Fail; } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl new file mode 100644 index 0000000000000..63d7fcac16874 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-store.cl @@ -0,0 +1,250 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals smart +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx900 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1010 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu
gfx1250 -emit-llvm -o - %s | FileCheck %s + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; + +//------------------------------------------------------------------------------ +// Global Load +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_global_load_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META7:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_wave(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META8:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_workgroup(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META9:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_device(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META10:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_system(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: 
@test_amdgcn_global_load_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META11:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_single(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_global_load_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p1(ptr addrspace(1) [[PTR:%.*]], metadata [[META12:![0-9]+]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_global_load_b128_cluster(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR); +} + +//------------------------------------------------------------------------------ +// Global Store +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_global_store_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_wave(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_workgroup(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]]) +// 
CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_device(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_system(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_single(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_global_store_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p1(ptr addrspace(1) [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_global_store_b128_cluster(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR); +} + +//------------------------------------------------------------------------------ +// Flat Load +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_wave( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META7]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_wave(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_workgroup( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META8]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_workgroup(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META9]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_device(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META10]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_system(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META11]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_single(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_flat_load_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.av.load.b128.p0(ptr [[PTR:%.*]], metadata [[META12]]) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +v4u32 test_amdgcn_flat_load_b128_cluster(v4u32 * ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_CLUSTR); +} + +//------------------------------------------------------------------------------ +// Flat Store +//------------------------------------------------------------------------------ + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_wave( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META7]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_wave(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WVFRNT); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_workgroup( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META8]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_workgroup(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_WRKGRP); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_device( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META9]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_device(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_DEVICE); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_system( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META10]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_system(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_single( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META11]]) +// CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_single(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SINGLE); +} + +// CHECK-LABEL: @test_amdgcn_flat_store_b128_cluster( +// CHECK-NEXT: entry: +// CHECK-NEXT: tail call void @llvm.amdgcn.av.store.b128.p0(ptr [[PTR:%.*]], <4 x i32> [[DATA:%.*]], metadata [[META12]]) +// 
CHECK-NEXT: ret void +// +void test_amdgcn_flat_store_b128_cluster(v4u32 * ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_CLUSTR); +} +//. +// CHECK: [[META7]] = !{!"wavefront"} +// CHECK: [[META8]] = !{!"workgroup"} +// CHECK: [[META9]] = !{!"agent"} +// CHECK: [[META10]] = !{!""} +// CHECK: [[META11]] = !{!"singlethread"} +// CHECK: [[META12]] = !{!"cluster"} +//. diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl index 8f02e6775d37a..a440a1c040270 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-cooperative-atomics.cl @@ -48,7 +48,7 @@ v4i test_amdgcn_cooperative_atomic_load_8x16B_acq_rel(global v4i* gaddr) void test_amdgcn_cooperative_atomic_store_32x4B__sharedptr(local int* addr, int val) { - __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_RELAXED, ""); // expected-error {{cooperative atomic requires a global or generic pointer}} + __builtin_amdgcn_cooperative_atomic_store_32x4B(addr, val, __ATOMIC_RELAXED, ""); // expected-error {{builtin requires a global or generic pointer}} } void test_amdgcn_cooperative_atomic_store_32x4B__ordering_not_imm(local int* addr, int ord, int val) diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl new file mode 100644 index 0000000000000..b2f7b46547632 --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-error.cl @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx950 -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; +typedef v4u32 __private *private_ptr_to_v4u32; + +void 
test_amdgcn_av_store_b128_bad_ptr(private_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, __MEMORY_SCOPE_SYSTEM); //expected-error{{builtin requires a global or generic pointer}} +} + +void test_amdgcn_av_store_b128_bad_scope(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, 42); //expected-error{{synchronization scope argument to atomic operation is invalid}} +} + +v4u32 test_amdgcn_av_load_b128_bad_ptr(private_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, __MEMORY_SCOPE_SYSTEM); //expected-error{{builtin requires a global or generic pointer}} +} + +v4u32 test_amdgcn_av_load_b128_bad_scope(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, 42); //expected-error{{synchronization scope argument to atomic operation is invalid}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl new file mode 100644 index 0000000000000..e85b120661cfd --- /dev/null +++ b/clang/test/SemaOpenCL/builtins-amdgcn-global-load-store-target-error.cl @@ -0,0 +1,26 @@ +// We test loads and stores separately because clang only seems to exit after +// the first 'target feature' error. 
+ +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_LOAD -S -verify -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_LOAD -S -verify -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_LOAD -S -verify -o - %s + +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx602 -DTEST_STORE -S -verify -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx705 -DTEST_STORE -S -verify -o - %s +// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-unknown-unknown -target-cpu gfx810 -DTEST_STORE -S -verify -o - %s +// REQUIRES: amdgpu-registered-target + +typedef __attribute__((__vector_size__(4 * sizeof(unsigned int)))) unsigned int v4u32; +typedef v4u32 __global *global_ptr_to_v4u32; + +#ifdef TEST_LOAD +v4u32 test_amdgcn_av_load_b128_target(global_ptr_to_v4u32 ptr) { + return __builtin_amdgcn_av_load_b128(ptr, 0); // expected-error{{'__builtin_amdgcn_av_load_b128' needs target feature gfx9-insts}} +} +#endif + +#ifdef TEST_STORE +void test_amdgcn_av_store_b128_target(global_ptr_to_v4u32 ptr, v4u32 data) { + __builtin_amdgcn_av_store_b128(ptr, data, 0); // expected-error{{'__builtin_amdgcn_av_store_b128' needs target feature gfx9-insts}} +} +#endif