-
Notifications
You must be signed in to change notification settings - Fork 14.8k
[AMDGPU] Add gfx1250 wmma_scale[16]_f32_32x16x128_f4 instructions #152194
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Add gfx1250 wmma_scale[16]_f32_32x16x128_f4 instructions #152194
Conversation
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Stanislav Mekhanoshin (rampitec) ChangesPatch is 91.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152194.diff 13 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 5821a69b4de17..b16d4a22207a7 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -805,6 +805,8 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, "V8fIiV16iIiV
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, "V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, "V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_scale_f32_32x16x128_f4, "V16fV16iV8iIsV16fIiIiiIiIiiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4, "V16fV16iV8iIsV16fIiIiLiIiIiLiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x64_bf16, "V8yIbV16yIbV32yV8yiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index e450b1415a04b..dad1f95ac710d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -894,6 +894,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
@@ -1172,6 +1174,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ArgsForMatchingMatrixTypes = {3, 0, 1};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
+ ArgsForMatchingMatrixTypes = {3, 0, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
+ ArgsForMatchingMatrixTypes = {3, 0, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4;
+ break;
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
index af8a457bec686..bdb1a7f0bb32f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
@@ -314,6 +314,28 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b,
*out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, 0, c);
}
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale_f32_32x16x128_f4(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i32 [[SCALE_SRC0:%.*]], i32 2, i32 1, i32 [[SCALE_SRC1:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int scale_src0, int scale_src1)
+{
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale16_f32_32x16x128_f4(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i64 [[SCALE_SRC0:%.*]], i32 2, i32 1, i64 [[SCALE_SRC1:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, long scale_src0, long scale_src1)
+{
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1);
+}
+
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_bf16(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
index 83fdc8c8c60ba..49ef2e571740c 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
@@ -230,6 +230,32 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b,
*out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_32x16x128_f4' must be a constant integer}}
}
+void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod, int scale_src0, int scale_src1, bool reuse)
+{
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod, long scale_src0, long scale_src1, bool reuse)
+{
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+}
+
void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index, int mod)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bfadc6a58f7f1..90cfd8cedd51b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3956,6 +3956,28 @@ class AMDGPUWmmaScaleIntrinsicModsC<LLVMType scale_ty> :
IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
+class AMDGPUWmmaScaleF4IntrinsicModsC<LLVMType scale_ty> :
+ Intrinsic<
+ [llvm_anyfloat_ty], // %D
+ [
+ llvm_anyint_ty, // %A
+ llvm_anyint_ty, // %B
+ llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs)
+ LLVMMatchType<0>, // %C
+ llvm_i32_ty, // matrix_a_scale
+ llvm_i32_ty, // matrix_a_scale_fmt
+ scale_ty, // matrix a scale exponential
+ llvm_i32_ty, // matrix_b_scale
+ llvm_i32_ty, // matrix_b_scale_fmt
+ scale_ty, // matrix b scale exponential
+ llvm_i1_ty, // matrix_a_reuse
+ llvm_i1_ty, // matrix_b_reuse
+ ],
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<7>>,
+ ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<10>>, ImmArg<ArgIndex<11>>,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]
+>;
+
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
@@ -3984,6 +4006,8 @@ def int_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUWmmaIntrinsicModsC_MatrixFMT;
def int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i32_ty>;
def int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i64_ty>;
def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_scale_f32_32x16x128_f4 : AMDGPUWmmaScaleF4IntrinsicModsC<llvm_i32_ty>;
+def int_amdgcn_wmma_scale16_f32_32x16x128_f4 : AMDGPUWmmaScaleF4IntrinsicModsC<llvm_i64_ty>;
}
class AMDGPUSWmmacIntrinsicABIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 74230a543ef16..868b1a21e3cd5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4801,6 +4801,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
+ case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4:
+ case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4:
case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 9264935ffad7b..ce280d484da1b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1763,6 +1763,8 @@ def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16
def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>;
def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>;
+def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>;
+def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>;
def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
@@ -1852,6 +1854,9 @@ defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64
defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">;
defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE">;
defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale16_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE16">;
+
+defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16x128_f4", F32_32X16X128_F4_SCALE_w32, "_w32">;
+defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
} // End is_wmma_xdl = 1.
defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>;
@@ -2010,6 +2015,8 @@ let SubtargetPredicate = isGFX125xOnly in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_SCALE_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale_f32_32x16x128_f4, F32_32X16X128_F4_SCALE_w32>;
+ defm : WMMAPat<"V_WMMA_SCALE16_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale16_f32_32x16x128_f4, F32_32X16X128_F4_SCALE16_w32>;
foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>;
@@ -2191,6 +2198,15 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0);
}
+multiclass VOP3PX2_Real_ScaledWMMA_F4<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
+ defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
+ let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32,
+ DecoderNamespace = "GFX1250" in {
+ def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, PS.Mnemonic>,
+ VOP3PX2e <op, LdScaleOp, WMMAP>;
+ }
+}
+
multiclass VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
@@ -2292,6 +2308,9 @@ defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x0
defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">;
defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">;
+defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x35, F32_32X16X128_F4_SCALE_w32>;
+defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>;
+
defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
index 1c7c625daaa78..1bf865c414279 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
@@ -2236,6 +2236,170 @@ bb:
ret void
}
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48
+; GFX1250-NEXT: global_store_b128 v[42:43], v[32:35], off offset:32
+; GFX1250-NEXT: global_store_b128 v[42:43], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[42:43], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: global_store_b128 v[42:43], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[42:43], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off offset:32
+; GISEL-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 false, i1 false)
+ store <16 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
+; GFX...
[truncated]
|
@llvm/pr-subscribers-llvm-ir Author: Stanislav Mekhanoshin (rampitec) ChangesPatch is 91.02 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/152194.diff 13 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 5821a69b4de17..b16d4a22207a7 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -805,6 +805,8 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4, "V8fIiV16iIiV
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, "V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, "V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_scale_f32_32x16x128_f4, "V16fV16iV8iIsV16fIiIiiIiIiiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4, "V16fV16iV8iIsV16fIiIiLiIiIiLiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x64_bf16, "V8yIbV16yIbV32yV8yiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index e450b1415a04b..dad1f95ac710d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -894,6 +894,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
@@ -1172,6 +1174,14 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ArgsForMatchingMatrixTypes = {3, 0, 1};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale_f32_32x16x128_f4:
+ ArgsForMatchingMatrixTypes = {3, 0, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4:
+ ArgsForMatchingMatrixTypes = {3, 0, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4;
+ break;
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
index af8a457bec686..bdb1a7f0bb32f 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
@@ -314,6 +314,28 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b,
*out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, 0, c);
}
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale_f32_32x16x128_f4(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i32 [[SCALE_SRC0:%.*]], i32 2, i32 1, i32 [[SCALE_SRC1:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int scale_src0, int scale_src1)
+{
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_scale16_f32_32x16x128_f4(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.scale16.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]], i32 1, i32 2, i64 [[SCALE_SRC0:%.*]], i32 2, i32 1, i64 [[SCALE_SRC1:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, long scale_src0, long scale_src1)
+{
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 2, scale_src0, 2, 1, scale_src1, 0, 1);
+}
+
// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_bf16(
// CHECK-GFX1250-NEXT: entry:
// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
index 83fdc8c8c60ba..49ef2e571740c 100644
--- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
@@ -230,6 +230,32 @@ void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b,
*out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_32x16x128_f4' must be a constant integer}}
}
+void test_amdgcn_wmma_scale_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod, int scale_src0, int scale_src1, bool reuse)
+{
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale_f32_32x16x128_f4' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_scale16_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod, long scale_src0, long scale_src1, bool reuse)
+{
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, mod, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, mod, 0, scale_src0, 2, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, mod, 0, scale_src1, 0, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, reuse, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 0, reuse); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, mod, scale_src0, 2, 0, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, mod, scale_src1, 1, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, mod, 0); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_scale16_f32_32x16x128_f4(a, b, 0, c, 1, 0, scale_src0, 2, 0, scale_src1, 1, mod); // expected-error {{'__builtin_amdgcn_wmma_scale16_f32_32x16x128_f4' must be a constant integer}}
+}
+
void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index, int mod)
{
*out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index bfadc6a58f7f1..90cfd8cedd51b 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -3956,6 +3956,28 @@ class AMDGPUWmmaScaleIntrinsicModsC<LLVMType scale_ty> :
IntrWillReturn, IntrNoCallback, IntrNoFree]
>;
+class AMDGPUWmmaScaleF4IntrinsicModsC<LLVMType scale_ty> :
+ Intrinsic<
+ [llvm_anyfloat_ty], // %D
+ [
+ llvm_anyint_ty, // %A
+ llvm_anyint_ty, // %B
+ llvm_i16_ty, // %C_mod: 0 - none, 1 - neg, 2 - abs, 3 - neg(abs)
+ LLVMMatchType<0>, // %C
+ llvm_i32_ty, // matrix_a_scale
+ llvm_i32_ty, // matrix_a_scale_fmt
+ scale_ty, // matrix a scale exponential
+ llvm_i32_ty, // matrix_b_scale
+ llvm_i32_ty, // matrix_b_scale_fmt
+ scale_ty, // matrix b scale exponential
+ llvm_i1_ty, // matrix_a_reuse
+ llvm_i1_ty, // matrix_b_reuse
+ ],
+ [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<4>>, ImmArg<ArgIndex<5>>, ImmArg<ArgIndex<7>>,
+ ImmArg<ArgIndex<8>>, ImmArg<ArgIndex<10>>, ImmArg<ArgIndex<11>>,
+ IntrWillReturn, IntrNoCallback, IntrNoFree]
+>;
+
defset list<Intrinsic> AMDGPUWMMAIntrinsicsGFX1250 = {
def int_amdgcn_wmma_f32_16x16x4_f32 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
def int_amdgcn_wmma_f32_16x16x32_bf16 : AMDGPUWmmaIntrinsicModsAllReuse<llvm_anyfloat_ty, llvm_anyfloat_ty>;
@@ -3984,6 +4006,8 @@ def int_amdgcn_wmma_f32_16x16x128_f8f6f4 : AMDGPUWmmaIntrinsicModsC_MatrixFMT;
def int_amdgcn_wmma_scale_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i32_ty>;
def int_amdgcn_wmma_scale16_f32_16x16x128_f8f6f4 : AMDGPUWmmaScaleIntrinsicModsC<llvm_i64_ty>;
def int_amdgcn_wmma_f32_32x16x128_f4 : AMDGPUWmmaIntrinsicF4ModsC<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_scale_f32_32x16x128_f4 : AMDGPUWmmaScaleF4IntrinsicModsC<llvm_i32_ty>;
+def int_amdgcn_wmma_scale16_f32_32x16x128_f4 : AMDGPUWmmaScaleF4IntrinsicModsC<llvm_i64_ty>;
}
class AMDGPUSWmmacIntrinsicABIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 74230a543ef16..868b1a21e3cd5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -4801,6 +4801,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_wmma_scale_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_scale16_f32_16x16x128_f8f6f4:
case Intrinsic::amdgcn_wmma_f32_32x16x128_f4:
+ case Intrinsic::amdgcn_wmma_scale_f32_32x16x128_f4:
+ case Intrinsic::amdgcn_wmma_scale16_f32_32x16x128_f4:
case Intrinsic::amdgcn_swmmac_f16_16x16x64_f16:
case Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16:
case Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16:
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 9264935ffad7b..ce280d484da1b 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1763,6 +1763,8 @@ def F16_FP8BF8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8i32, v8i32, v8f16
def F16_FP8BF8X128_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v16i32, v16i32, v8f16], 0, 0, 0, 1, 1, 0, 0, 0, 1>;
def F32_32X16X128_F4_WMMA_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 0, 1, 0, 0, 0, 0, 1>;
def I32_IU8X64_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v8i32, v8i32, v8i32], 0, 0, 1, 0, 1, 0, 0, 0, 1>;
+def F32_32X16X128_F4_SCALE_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 0, 1>;
+def F32_32X16X128_F4_SCALE16_w32 : VOP3PWMMA_Profile<[v16f32, v16i32, v8i32, v16f32], 0, 0, 0, 1, 1, 0, 1, 1, 1>;
def F32_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16f16, v32f16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
def F32_BF16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v16bf16, v32bf16, v8f32], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
def F16_F16X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v16f16, v32f16, v8f16], 1, 16, 0, 0, 1, 0, 0, 0, 1>;
@@ -1852,6 +1854,9 @@ defm V_SWMMAC_F16_16X16X64_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x64
defm V_WMMA_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4">;
defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE">;
defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : WMMAInst_SrcFormats_mc<"v_wmma_scale16_f32_16x16x128_f8f6f4", "F32_16X16X128_F8F6F4_SCALE16">;
+
+defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale_f32_32x16x128_f4", F32_32X16X128_F4_SCALE_w32, "_w32">;
+defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : WMMAInstGFX12<"v_wmma_scale16_f32_32x16x128_f4", F32_32X16X128_F4_SCALE16_w32, "_w32">;
} // End is_wmma_xdl = 1.
defm V_WMMA_LD_SCALE_PAIRED_B32 : VOP3PInst<"v_wmma_ld_scale_paired_b32", VOP_WMMA_LD_SCALE<i32, VCSrc_b32>>;
@@ -2010,6 +2015,8 @@ let SubtargetPredicate = isGFX125xOnly in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_fp8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_16X16X128_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x128_bf8_bf8, F32_FP8BF8X128_WMMA_w32>;
defm : WMMAPat<"V_WMMA_F32_32X16X128_F4_w32", int_amdgcn_wmma_f32_32x16x128_f4, F32_32X16X128_F4_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_SCALE_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale_f32_32x16x128_f4, F32_32X16X128_F4_SCALE_w32>;
+ defm : WMMAPat<"V_WMMA_SCALE16_F32_32X16X128_F4_w32", int_amdgcn_wmma_scale16_f32_32x16x128_f4, F32_32X16X128_F4_SCALE16_w32>;
foreach I = ["f8_f8", "f8_f6", "f8_f4", "f6_f8", "f6_f6", "f6_f4", "f4_f8", "f4_f6", "f4_f4"] in {
defm : WMMAPat<"V_WMMA_F32_16X16X128_F8F6F4_" # I # "_w32", int_amdgcn_wmma_f32_16x16x128_f8f6f4, !cast<VOP3PWMMA_Profile>("F32_16X16X128_F8F6F4_" # I # "_w32")>;
@@ -2191,6 +2198,15 @@ class VOP3PX2e <bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile P> : Enc128, VO
let Inst{127} = !if(P.NegLo2, src2_modifiers{0}, 0);
}
+multiclass VOP3PX2_Real_ScaledWMMA_F4<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
+ defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
+ let SubtargetPredicate = isGFX1250Plus, WaveSizePredicate = isWave32,
+ DecoderNamespace = "GFX1250" in {
+ def _gfx1250 : VOP3P_Real_Gen<PS, GFX1250Gen, PS.Mnemonic>,
+ VOP3PX2e <op, LdScaleOp, WMMAP>;
+ }
+}
+
multiclass VOP3PX2_Real_ScaledWMMA<bits<8> op, bits<8> LdScaleOp, VOP3PWMMA_Profile WMMAP> {
defvar PS = !cast<VOP3P_Pseudo>(NAME # "_twoaddr");
defvar asmName = !substr(PS.Mnemonic, 0, !sub(!size(PS.Mnemonic), !size("_f8_f8_w32")));
@@ -2292,6 +2308,9 @@ defm V_WMMA_F32_16X16X128_F8F6F4 : VOP3P_Real_WMMA_gfx1250_SrcFormats<0x0
defm V_WMMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x35, "F32_16X16X128_F8F6F4_SCALE">;
defm V_WMMA_SCALE16_F32_16X16X128_F8F6F4 : VOP3PX2_Real_ScaledWMMA_SrcFormats<0x033, 0x3a, "F32_16X16X128_F8F6F4_SCALE16">;
+defm V_WMMA_SCALE_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x35, F32_32X16X128_F4_SCALE_w32>;
+defm V_WMMA_SCALE16_F32_32X16X128_F4_w32 : VOP3PX2_Real_ScaledWMMA_F4<0x088, 0x3a, F32_32X16X128_F4_SCALE16_w32>;
+
defm V_SWMMAC_F32_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x065, F32_F16X64_SWMMAC_w32>;
defm V_SWMMAC_F32_16X16X64_BF16_w32 : VOP3P_Real_WMMA_gfx1250 <0x066, F32_BF16X64_SWMMAC_w32>;
defm V_SWMMAC_F16_16X16X64_F16_w32 : VOP3P_Real_WMMA_gfx1250 <0x067, F16_F16X64_SWMMAC_w32>;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
index 1c7c625daaa78..1bf865c414279 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wmma.gfx1250.w32.ll
@@ -2236,6 +2236,170 @@ bb:
ret void
}
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 %scale_src0, i32 %scale_src1, ptr addrspace(1) %out) {
+; GFX1250-LABEL: test_wmma_scale_f32_32x16x128_f4:
+; GFX1250: ; %bb.0: ; %bb
+; GFX1250-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GFX1250-NEXT: s_clause 0x3
+; GFX1250-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48
+; GFX1250-NEXT: global_store_b128 v[42:43], v[32:35], off offset:32
+; GFX1250-NEXT: global_store_b128 v[42:43], v[28:31], off offset:16
+; GFX1250-NEXT: global_store_b128 v[42:43], v[24:27], off
+; GFX1250-NEXT: s_endpgm
+;
+; GISEL-LABEL: test_wmma_scale_f32_32x16x128_f4:
+; GISEL: ; %bb.0: ; %bb
+; GISEL-NEXT: v_wmma_scale_f32_32x16x128_f4 v[24:39], v[0:15], v[16:23], v[24:39], v40, v41 matrix_a_scale:MATRIX_SCALE_ROW1 matrix_b_scale:MATRIX_SCALE_ROW1
+; GISEL-NEXT: s_clause 0x3
+; GISEL-NEXT: global_store_b128 v[42:43], v[24:27], off
+; GISEL-NEXT: global_store_b128 v[42:43], v[28:31], off offset:16
+; GISEL-NEXT: global_store_b128 v[42:43], v[32:35], off offset:32
+; GISEL-NEXT: global_store_b128 v[42:43], v[36:39], off offset:48
+; GISEL-NEXT: s_endpgm
+bb:
+ %res = call <16 x float> @llvm.amdgcn.wmma.scale.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> %A, <8 x i32> %B, i16 0, <16 x float> %C, i32 1, i32 0, i32 %scale_src0, i32 1, i32 0, i32 %scale_src1, i1 false, i1 false)
+ store <16 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_scale_f32_32x16x128_f4_ss(<16 x i32> %A, <8 x i32> %B, <16 x float> %C, i32 inreg %scale_src0, i32 inreg %scale_src1, ptr addrspace(1) %out) {
+; GFX...
[truncated]
|
…a_scale_16__f32_32x16x128_f4_instructions
…a_scale_16__f32_32x16x128_f4_instructions
…a_scale_16__f32_32x16x128_f4_instructions
…a_scale_16__f32_32x16x128_f4_instructions
…a_scale_16__f32_32x16x128_f4_instructions
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/3/builds/20129 Here is the relevant piece of the build log for the reference
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/66/builds/17543 Here is the relevant piece of the build log for the reference
|
No description provided.