-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
AMDGPU: Rename intrinsics and remove f16/bf16 versions for load transpose #86313
Conversation
…pose Rename the intrinsics to close to the instruction mnemonic names: Use global_load_re_b64 and global_load_tr_b128 instead of global_load_tr. This patch also removes f16/bf16 versions of builtins/intrinsics. To simplify the design, we should avoid enumerating all possible types in implementing builtins. We can always use bitcast.
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-clang-codegen Author: Changpeng Fang (changpeng) ChangesRename the intrinsics to close to the instruction mnemonic names: This patch also removes f16/bf16 versions of builtins/intrinsics. To simplify the design, we should avoid enumerating all possible types in implementing builtins. We can always use bitcast. Patch is 43.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/86313.diff 15 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 4153b316c22b1d..c660582cc98e66 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -434,13 +434,8 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32")
-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64")
//===----------------------------------------------------------------------===//
// WMMA builtins.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2eaceeba617700..e476234b1379ab 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18533,51 +18533,35 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
}
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
+ Intrinsic::ID IID;
llvm::Type *ArgTy;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
+ IID = Intrinsic::amdgcn_global_load_tr_b64;
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getInt32Ty(getLLVMContext()), 2);
- break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
- ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getBFloatTy(getLLVMContext()), 4);
- break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
- ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getHalfTy(getLLVMContext()), 4);
+ IID = Intrinsic::amdgcn_global_load_tr_b64;
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getInt16Ty(getLLVMContext()), 4);
- break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
- ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getBFloatTy(getLLVMContext()), 8);
- break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
- ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getHalfTy(getLLVMContext()), 8);
+ IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getInt16Ty(getLLVMContext()), 8);
+ IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
}
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
- llvm::Function *F =
- CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
+ llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
return Builder.CreateCall(F, {Addr});
}
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
index 4363769b864571..1e78ab28348682 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
@@ -4,24 +4,14 @@
// REQUIRES: amdgpu-registered-target
typedef int v2i __attribute__((ext_vector_type(2)));
-typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
-typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
-
-typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));
-typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
-void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr,
- global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global int* int_inptr, global v4s* v4s_inptr)
{
v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
- v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
- v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
- int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
- v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
- v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
- v4bf16 o8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
+ int out_3 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
+ v4s out_4 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
index 208f92fc5d44f3..1acc4cd7adc960 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
@@ -3,14 +3,10 @@
// REQUIRES: amdgpu-registered-target
-typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));
-typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
-void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
+void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr)
{
int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
- v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
- v4bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
index 199146a9715da6..96b0e4c3993ab6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
@@ -4,14 +4,10 @@
// REQUIRES: amdgpu-registered-target
typedef int v2i __attribute__((ext_vector_type(2)));
-typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
-typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
-void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr)
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr)
{
v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
- v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
- v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
index 0035b16b902b68..126d7d6fb7b053 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
@@ -3,13 +3,11 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
typedef int v2i __attribute__((ext_vector_type(2)));
-typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
-typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]]
//
v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
@@ -19,30 +17,10 @@ v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]]
//
v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr)
{
return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr);
}
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16(
-// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]])
-// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]]
-//
-v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr)
-{
- return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr);
-}
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16(
-// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
-// CHECK-GFX1200-NEXT: ret <8 x bfloat> [[TMP0]]
-//
-v8bf16 test_amdgcn_global_load_tr_b128_v8bf16(global v8bf16* inptr)
-{
- return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr);
-}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
index 6c025bb5a55a36..7c70ccf73ad385 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
@@ -2,13 +2,11 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
-typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));
-typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret i32 [[TMP0]]
//
int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
@@ -18,30 +16,10 @@ int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]]
//
v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr)
{
return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr);
}
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16(
-// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]])
-// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]]
-//
-v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr)
-{
- return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr);
-}
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16(
-// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) [[INPTR:%.*]])
-// CHECK-GFX1200-NEXT: ret <4 x bfloat> [[TMP0]]
-//
-v4bf16 test_amdgcn_global_load_tr_b128_v4bf16(global v4bf16* inptr)
-{
- return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr);
-}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index fff03dee20a18b..bda3b066b77636 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2769,17 +2769,14 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
>;
// Wave32
-// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64
-// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128
-// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128
-// <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -> global_load_tr_b128
+// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -> global_load_tr_b64
+// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) -> global_load_tr_b128
// Wave64
-// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64
-// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128
-// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128
-// <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> global_load_tr_b128
+// i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) -> global_load_tr_b64
+// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) -> global_load_tr_b128
-def int_amdgcn_global_load_tr : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
// i32 @llvm.amdgcn.wave.id()
def int_amdgcn_wave_id :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0037825ce08938..a42e95f140ce99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4889,7 +4889,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
- case Intrinsic::amdgcn_global_load_tr:
+ case Intrinsic::amdgcn_global_load_tr_b64:
+ case Intrinsic::amdgcn_global_load_tr_b128:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 8eb46a9801482c..410dc83d45c57f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -365,7 +365,8 @@ def : SourceOfDivergence<intr>;
foreach intr = AMDGPUWMMAIntrinsicsGFX12 in
def : SourceOfDivergence<intr>;
-def : SourceOfDivergence<int_amdgcn_global_load_tr>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db1f8c1872652b..d017ec4a741510 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1593,16 +1593,12 @@ let OtherPredicates = [isGFX12Plus] in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
let WaveSizePredicate = isWave32 in {
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8bf16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>;
}
let WaveSizePredicate = isWave64 in {
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4bf16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64...
[truncated]
|
@llvm/pr-subscribers-clang Author: Changpeng Fang (changpeng) ChangesRename the intrinsics to close to the instruction mnemonic names: This patch also removes f16/bf16 versions of builtins/intrinsics. To simplify the design, we should avoid enumerating all possible types in implementing builtins. We can always use bitcast. Patch is 43.75 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/86313.diff 15 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 4153b316c22b1d..c660582cc98e66 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -434,13 +434,8 @@ TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_v2i32, "V2iV2i*1", "nc", "gfx12-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8i16, "V8sV8s*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8f16, "V8hV8h*1", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v8bf16, "V8yV8y*1", "nc", "gfx12-insts,wavefrontsize32")
-
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b64_i32, "ii*1", "nc", "gfx12-insts,wavefrontsize64")
TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "gfx12-insts,wavefrontsize64")
//===----------------------------------------------------------------------===//
// WMMA builtins.
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 2eaceeba617700..e476234b1379ab 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18533,51 +18533,35 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
}
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: {
+ Intrinsic::ID IID;
llvm::Type *ArgTy;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_i32:
ArgTy = llvm::Type::getInt32Ty(getLLVMContext());
+ IID = Intrinsic::amdgcn_global_load_tr_b64;
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b64_v2i32:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getInt32Ty(getLLVMContext()), 2);
- break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4bf16:
- ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getBFloatTy(getLLVMContext()), 4);
- break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4f16:
- ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getHalfTy(getLLVMContext()), 4);
+ IID = Intrinsic::amdgcn_global_load_tr_b64;
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getInt16Ty(getLLVMContext()), 4);
- break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16:
- ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getBFloatTy(getLLVMContext()), 8);
- break;
- case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16:
- ArgTy = llvm::FixedVectorType::get(
- llvm::Type::getHalfTy(getLLVMContext()), 8);
+ IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16:
ArgTy = llvm::FixedVectorType::get(
llvm::Type::getInt16Ty(getLLVMContext()), 8);
+ IID = Intrinsic::amdgcn_global_load_tr_b128;
break;
}
llvm::Value *Addr = EmitScalarExpr(E->getArg(0));
- llvm::Function *F =
- CGM.getIntrinsic(Intrinsic::amdgcn_global_load_tr, {ArgTy});
+ llvm::Function *F = CGM.getIntrinsic(IID, {ArgTy});
return Builder.CreateCall(F, {Addr});
}
case AMDGPU::BI__builtin_amdgcn_get_fpenv: {
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
index 4363769b864571..1e78ab28348682 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx11-err.cl
@@ -4,24 +4,14 @@
// REQUIRES: amdgpu-registered-target
typedef int v2i __attribute__((ext_vector_type(2)));
-typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
-typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
-
-typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));
-typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
-void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr,
- global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global int* int_inptr, global v4s* v4s_inptr)
{
v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
- v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
- v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
- int out_5 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
- v4s out_6 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
- v4h out_7 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
- v4bf16 o8 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
+ int out_3 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
+ v4s out_4 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
index 208f92fc5d44f3..1acc4cd7adc960 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w32-err.cl
@@ -3,14 +3,10 @@
// REQUIRES: amdgpu-registered-target
-typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));
-typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
-void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr, global v4h* v4h_inptr, global v4bf16* v4bf16_inptr)
+void amdgcn_global_load_tr(global int* int_inptr, global v4s* v4s_inptr)
{
int out_1 = __builtin_amdgcn_global_load_tr_b64_i32(int_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_i32' needs target feature gfx12-insts,wavefrontsize64}}
v4s out_2 = __builtin_amdgcn_global_load_tr_b128_v4i16(v4s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4i16' needs target feature gfx12-insts,wavefrontsize64}}
- v4h out_3 = __builtin_amdgcn_global_load_tr_b128_v4f16(v4h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4f16' needs target feature gfx12-insts,wavefrontsize64}}
- v4bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v4bf16(v4bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v4bf16' needs target feature gfx12-insts,wavefrontsize64}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
index 199146a9715da6..96b0e4c3993ab6 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-gfx12-w64-err.cl
@@ -4,14 +4,10 @@
// REQUIRES: amdgpu-registered-target
typedef int v2i __attribute__((ext_vector_type(2)));
-typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
-typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
-void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr, global v8h* v8h_inptr, global v8bf16* v8bf16_inptr)
+void amdgcn_global_load_tr(global v2i* v2i_inptr, global v8s* v8s_inptr)
{
v2i out_1 = __builtin_amdgcn_global_load_tr_b64_v2i32(v2i_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b64_v2i32' needs target feature gfx12-insts,wavefrontsize32}}
v8s out_2 = __builtin_amdgcn_global_load_tr_b128_v8i16(v8s_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8i16' needs target feature gfx12-insts,wavefrontsize32}}
- v8h out_3 = __builtin_amdgcn_global_load_tr_b128_v8f16(v8h_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8f16' needs target feature gfx12-insts,wavefrontsize32}}
- v8bf16 o4 = __builtin_amdgcn_global_load_tr_b128_v8bf16(v8bf16_inptr); // expected-error{{'__builtin_amdgcn_global_load_tr_b128_v8bf16' needs target feature gfx12-insts,wavefrontsize32}}
}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
index 0035b16b902b68..126d7d6fb7b053 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w32.cl
@@ -3,13 +3,11 @@
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
typedef int v2i __attribute__((ext_vector_type(2)));
-typedef half v8h __attribute__((ext_vector_type(8)));
typedef short v8s __attribute__((ext_vector_type(8)));
-typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_v2i32(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <2 x i32> [[TMP0]]
//
v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
@@ -19,30 +17,10 @@ v2i test_amdgcn_global_load_tr_b64_v2i32(global v2i* inptr)
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8i16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <8 x i16> [[TMP0]]
//
v8s test_amdgcn_global_load_tr_b128_v8i16(global v8s* inptr)
{
return __builtin_amdgcn_global_load_tr_b128_v8i16(inptr);
}
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8f16(
-// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) [[INPTR:%.*]])
-// CHECK-GFX1200-NEXT: ret <8 x half> [[TMP0]]
-//
-v8h test_amdgcn_global_load_tr_b128_v8f16(global v8h* inptr)
-{
- return __builtin_amdgcn_global_load_tr_b128_v8f16(inptr);
-}
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v8bf16(
-// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) [[INPTR:%.*]])
-// CHECK-GFX1200-NEXT: ret <8 x bfloat> [[TMP0]]
-//
-v8bf16 test_amdgcn_global_load_tr_b128_v8bf16(global v8bf16* inptr)
-{
- return __builtin_amdgcn_global_load_tr_b128_v8bf16(inptr);
-}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
index 6c025bb5a55a36..7c70ccf73ad385 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-global-load-tr-w64.cl
@@ -2,13 +2,11 @@
// REQUIRES: amdgpu-registered-target
// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
-typedef half v4h __attribute__((ext_vector_type(4)));
typedef short v4s __attribute__((ext_vector_type(4)));
-typedef __bf16 v4bf16 __attribute__((ext_vector_type(4)));
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b64_i32(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret i32 [[TMP0]]
//
int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
@@ -18,30 +16,10 @@ int test_amdgcn_global_load_tr_b64_i32(global int* inptr)
// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4i16(
// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) [[INPTR:%.*]])
+// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1) [[INPTR:%.*]])
// CHECK-GFX1200-NEXT: ret <4 x i16> [[TMP0]]
//
v4s test_amdgcn_global_load_tr_b128_v4i16(global v4s* inptr)
{
return __builtin_amdgcn_global_load_tr_b128_v4i16(inptr);
}
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4f16(
-// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) [[INPTR:%.*]])
-// CHECK-GFX1200-NEXT: ret <4 x half> [[TMP0]]
-//
-v4h test_amdgcn_global_load_tr_b128_v4f16(global v4h* inptr)
-{
- return __builtin_amdgcn_global_load_tr_b128_v4f16(inptr);
-}
-
-// CHECK-GFX1200-LABEL: @test_amdgcn_global_load_tr_b128_v4bf16(
-// CHECK-GFX1200-NEXT: entry:
-// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) [[INPTR:%.*]])
-// CHECK-GFX1200-NEXT: ret <4 x bfloat> [[TMP0]]
-//
-v4bf16 test_amdgcn_global_load_tr_b128_v4bf16(global v4bf16* inptr)
-{
- return __builtin_amdgcn_global_load_tr_b128_v4bf16(inptr);
-}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index fff03dee20a18b..bda3b066b77636 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2769,17 +2769,14 @@ class AMDGPULoadIntrinsic<LLVMType ptr_ty>:
>;
// Wave32
-// <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) -> global_load_tr_b64
-// <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) -> global_load_tr_b128
-// <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) -> global_load_tr_b128
-// <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) -> global_load_tr_b128
+// <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32(ptr addrspace(1)) -> global_load_tr_b64
+// <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16(ptr addrspace(1)) -> global_load_tr_b128
// Wave64
-// i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) -> global_load_tr_b64
-// <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) -> global_load_tr_b128
-// <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) -> global_load_tr_b128
-// <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) -> global_load_tr_b128
+// i32 @llvm.amdgcn.global.load.tr.b64.i32(ptr addrspace(1)) -> global_load_tr_b64
+// <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16(ptr addrspace(1)) -> global_load_tr_b128
-def int_amdgcn_global_load_tr : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_global_load_tr_b64 : AMDGPULoadIntrinsic<global_ptr_ty>;
+def int_amdgcn_global_load_tr_b128 : AMDGPULoadIntrinsic<global_ptr_ty>;
// i32 @llvm.amdgcn.wave.id()
def int_amdgcn_wave_id :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 0037825ce08938..a42e95f140ce99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4889,7 +4889,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_flat_atomic_fadd_v2bf16:
case Intrinsic::amdgcn_atomic_cond_sub_u32:
case Intrinsic::amdgcn_global_atomic_ordered_add_b64:
- case Intrinsic::amdgcn_global_load_tr:
+ case Intrinsic::amdgcn_global_load_tr_b64:
+ case Intrinsic::amdgcn_global_load_tr_b128:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 8eb46a9801482c..410dc83d45c57f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -365,7 +365,8 @@ def : SourceOfDivergence<intr>;
foreach intr = AMDGPUWMMAIntrinsicsGFX12 in
def : SourceOfDivergence<intr>;
-def : SourceOfDivergence<int_amdgcn_global_load_tr>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr_b64>;
+def : SourceOfDivergence<int_amdgcn_global_load_tr_b128>;
// The dummy boolean output is divergent from the IR's perspective,
// but the mask results are uniform. These produce a divergent and
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index db1f8c1872652b..d017ec4a741510 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1593,16 +1593,12 @@ let OtherPredicates = [isGFX12Plus] in {
defm : GlobalFLATAtomicPatsRtn <"GLOBAL_ATOMIC_ORDERED_ADD_B64", "int_amdgcn_global_atomic_ordered_add_b64", i64, i64, /* isIntr */ 1>;
let WaveSizePredicate = isWave32 in {
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr, v2i32>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8i16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8f16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr, v8bf16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w32, int_amdgcn_global_load_tr_b64, v2i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w32, int_amdgcn_global_load_tr_b128, v8i16>;
}
let WaveSizePredicate = isWave64 in {
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr, i32>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4i16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4f16>;
- defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64, int_amdgcn_global_load_tr, v4bf16>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B64_w64, int_amdgcn_global_load_tr_b64, i32>;
+ defm : GlobalFLATLoadPats <GLOBAL_LOAD_TR_B128_w64...
[truncated]
|
Type global_load_re_b64. |
Changed! Thanks. |
There is no issue in changing the names in principle. Curious, what is the rationale to use more demangled names? |
more user friendly. |
ping |
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v4i16: | ||
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8bf16: | ||
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8f16: | ||
case AMDGPU::BI__builtin_amdgcn_global_load_tr_b128_v8i16: { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
unnecessary line here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe. But people may have good reason to place it there, and this patch should not touch it.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Rename the intrinsics to close to the instruction mnemonic names:
Use global_load_tr_b64 and global_load_tr_b128 instead of global_load_tr.
This patch also removes f16/bf16 versions of builtins/intrinsics. To simplify the design, we should avoid enumerating all possible types in implementing builtins. We can always use bitcast.