diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index 61ec8b79bf054..4153b316c22b1 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -432,13 +432,15 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts") TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") - -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") -TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64") //===----------------------------------------------------------------------===// // WMMA builtins. diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index e14e890882821..2eaceeba61770 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18531,35 +18531,45 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy}); return Builder.CreateCall(F, {Addr, Val, ZeroI32, ZeroI32, ZeroI1}); } - case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16: - case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: { + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: { llvm::Type *ArgTy; switch (BuiltinID) { - case AMDGPU::BI__builtin_amdgcn_global_load_tr_i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32: ArgTy = llvm::Type::getInt32Ty(getLLVMContext()); break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_v2i32: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32: ArgTy = llvm::FixedVectorType::get( llvm::Type::getInt32Ty(getLLVMContext()), 2); break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16: + ArgTy = llvm::FixedVectorType::get( + llvm::Type::getBFloatTy(getLLVMContext()), 4); + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16: ArgTy = llvm::FixedVectorType::get( llvm::Type::getHalfTy(getLLVMContext()), 4); break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_v4i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: ArgTy = llvm::FixedVectorType::get( llvm::Type::getInt16Ty(getLLVMContext()), 4); break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8f16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: + ArgTy = llvm::FixedVectorType::get( + llvm::Type::getBFloatTy(getLLVMContext()), 8); + break; + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: ArgTy = llvm::FixedVectorType::get( llvm::Type::getHalfTy(getLLVMContext()), 8); break; - case AMDGPU::BI__builtin_amdgcn_global_load_tr_v8i16: + case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: ArgTy = llvm::FixedVectorType::get( llvm::Type::getInt16Ty(getLLVMContext()), 8); break; diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl index f7afb7cb97eda..4363769b86457 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl @@ -6,21 +6,22 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef half v8h __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); +typedef __bf16 v8bf16 __attribute__((ext_vector_type(8))); typedef half v4h __attribute__((ext_vector_type(4))); typedef short v4s __attribute__((ext_vector_type(4))); +typedef __bf16 v4bf16 __attribute__((ext_vector_type(4))); - - -void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, - global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr) +void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr, + global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr) { - v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}} - v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}} - v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}} - - int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}} - v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}} - v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}} + v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} + v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} + v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} + v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}} + + int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} + v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} + v4bf16 o8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}} } - diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl index 04ac0a66db7ce..208f92fc5d44f 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl @@ -5,11 +5,12 @@ typedef half v4h __attribute__((ext_vector_type(4))); typedef short v4s __attribute__((ext_vector_type(4))); +typedef __bf16 v4bf16 __attribute__((ext_vector_type(4))); -void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr) +void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr) { - int out_4 = __builtin_amdgcn_global_load_tr_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_i32' needs target feature gfx12-insts,wavefrontsize64}} - v4s out_5 = __builtin_amdgcn_global_load_tr_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4i16' needs target feature gfx12-insts,wavefrontsize64}} - v4h out_6 = __builtin_amdgcn_global_load_tr_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v4f16' needs target feature gfx12-insts,wavefrontsize64}} + int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}} + v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}} + v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}} + v4bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}} } - diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl index 113b54b853a9f..199146a9715da 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl @@ -6,11 +6,12 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef half v8h __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); +typedef __bf16 v8bf16 __attribute__((ext_vector_type(8))); -void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr) +void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr) { - v2i out_1 = __builtin_amdgcn_global_load_tr_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v2i32' needs target feature gfx12-insts,wavefrontsize32}} - v8s out_2 = __builtin_amdgcn_global_load_tr_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8i16' needs target feature gfx12-insts,wavefrontsize32}} - v8h out_3 = __builtin_amdgcn_global_load_tr_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_v8f16' needs target feature gfx12-insts,wavefrontsize32}} + v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}} + v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}} + v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}} + v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}} } - diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl index b5fcad68a4702..0035b16b902b6 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl @@ -5,44 +5,44 @@ typedef int v2i __attribute__((ext_vector_type(2))); typedef half v8h __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); +typedef __bf16 v8bf16 __attribute__((ext_vector_type(8))); -// Wave32 - -// -// amdgcn_global_load_tr -// - -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v2i32( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32( // CHECK-GFX1200-NEXT: entry: // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]] // -v2i test_amdgcn_global_load_tr_v2i32(global v2i* inptr) +v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr) { - return __builtin_amdgcn_global_load_tr_v2i32(inptr); + return __builtin_amdgcn_global_load_tr_b64_v2i32(inptr); } -// -// amdgcn_global_load_tr -// - -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8i16( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16( // CHECK-GFX1200-NEXT: entry: // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]] // -v8s test_amdgcn_global_load_tr_v8i16(global v8s* inptr) +v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr) { - return __builtin_amdgcn_global_load_tr_v8i16(inptr); + return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr); } -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v8f16( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16( // CHECK-GFX1200-NEXT: entry: // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]] // -v8h test_amdgcn_global_load_tr_v8f16(global v8h* inptr) +v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr) { - return __builtin_amdgcn_global_load_tr_v8f16(inptr); + return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr); } +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <8 x bfloat> [[TMP0]] +// +v8bf16 test_amdgcn_global_load_tr_b128_v8bf16(global v8bf16* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl index 9c48ac071b4d3..6c025bb5a55a3 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl @@ -4,44 +4,44 @@ typedef half v4h __attribute__((ext_vector_type(4))); typedef short v4s __attribute__((ext_vector_type(4))); +typedef __bf16 v4bf16 __attribute__((ext_vector_type(4))); -// Wave64 - -// -// amdgcn_global_load_tr -// - -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_i32( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32( // CHECK-GFX1200-NEXT: entry: // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret i32 [[TMP0]] // -int test_amdgcn_global_load_tr_i32(global int* inptr) +int test_amdgcn_global_load_tr_b64_i32(global int* inptr) { - return __builtin_amdgcn_global_load_tr_i32(inptr); + return __builtin_amdgcn_global_load_tr_b64_i32(inptr); } -// -// amdgcn_global_load_tr -// - -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4i16( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16( // CHECK-GFX1200-NEXT: entry: // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]] // -v4s test_amdgcn_global_load_tr_v4i16(global v4s* inptr) +v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr) { - return __builtin_amdgcn_global_load_tr_v4i16(inptr); + return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr); } -// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_v4f16( +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16( // CHECK-GFX1200-NEXT: entry: // CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]]) // CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]] // -v4h test_amdgcn_global_load_tr_v4f16(global v4h* inptr) +v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr) { - return __builtin_amdgcn_global_load_tr_v4f16(inptr); + return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr); } +// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) [[INPTR:%.*]]) +// CHECK-GFX1200-NEXT: ret <4 x bfloat> [[TMP0]] +// +v4bf16 test_amdgcn_global_load_tr_b128_v4bf16(global v4bf16* inptr) +{ + return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr); +}