diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 051e603c0819d..fff03dee20a18 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -2653,6 +2653,8 @@ class AMDGPUWmmaIntrinsicIU : // The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit. // The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers. // The content of the other 16-bit half is preserved from the input. + +defset list AMDGPUWMMAIntrinsicsGFX11 = { def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL; def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL; @@ -2668,6 +2670,7 @@ def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU; def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL; +} //===----------------------------------------------------------------------===// // GFX12 Intrinsics @@ -2687,20 +2690,6 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var" [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree]>; - -// WMMA (Wave Matrix Multiply-Accumulate) intrinsics -// -// These operations perform a matrix multiplication and accumulation of -// the form: D = A * B + C . - -// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>. -def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic; -def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic; -def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic; -def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic; -// A and B are <16 x iu4>. -def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU; - // SWMMAC (Wave Matrix(sparse) Multiply-Accumulate) intrinsics // // These operations perform a sparse matrix multiplication and accumulation of @@ -2734,6 +2723,20 @@ class AMDGPUSWmmacIntrinsicIUIdx>, ImmArg>, ImmArg>] >; +defset list AMDGPUWMMAIntrinsicsGFX12 = { +// WMMA (Wave Matrix Multiply-Accumulate) intrinsics +// +// These operations perform a matrix multiplication and accumulation of +// the form: D = A * B + C . + +// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>. +def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic; +def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic; +// A and B are <16 x iu4>. +def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU; + def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx; def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx; def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx; @@ -2745,6 +2748,7 @@ def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx; def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx; def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx; +} def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn; @@ -3012,6 +3016,7 @@ class AMDGPUMfmaIntrinsic : [IntrConvergent, IntrNoMem, ImmArg>, ImmArg>, ImmArg>]>; +defset list AMDGPUMFMAIntrinsics908 = { def int_amdgcn_mfma_f32_32x32x1f32 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_16x16x1f32 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_4x4x1f32 : AMDGPUMfmaIntrinsic; @@ -3032,6 +3037,7 @@ def int_amdgcn_mfma_f32_16x16x2bf16 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_32x32x4bf16 : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_16x16x8bf16 : AMDGPUMfmaIntrinsic; +} //===----------------------------------------------------------------------===// // gfx90a intrinsics @@ -3043,6 +3049,7 @@ def int_amdgcn_flat_atomic_fadd : AMDGPUAtomicRtn; def int_amdgcn_flat_atomic_fmin : AMDGPUAtomicRtn; def int_amdgcn_flat_atomic_fmax : AMDGPUAtomicRtn; +defset list AMDGPUMFMAIntrinsics90A = { def int_amdgcn_mfma_f32_32x32x4bf16_1k : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_16x16x4bf16_1k : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f32_4x4x4bf16_1k : AMDGPUMfmaIntrinsic; @@ -3054,25 +3061,12 @@ def int_amdgcn_mfma_f32_16x16x16bf16_1k : AMDGPUMfmaIntrinsic; def int_amdgcn_mfma_f64_4x4x4f64 : AMDGPUMfmaIntrinsic; +} //===----------------------------------------------------------------------===// // gfx940 intrinsics // ===----------------------------------------------------------------------===// -// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. -def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn; -def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn; -def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic< - [llvm_v2i16_ty], - [LLVMQualPointerType<3>, llvm_v2i16_ty], - [IntrArgMemOnly, NoCapture>]>, - ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">; - -def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic; -def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic; -def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic; -def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic; - class AMDGPUMFp8MfmaIntrinsic : AMDGPUMfmaIntrinsic; @@ -3081,9 +3075,6 @@ multiclass AMDGPUMFp8MfmaIntrinsic { def NAME#"_"#kind : AMDGPUMFp8MfmaIntrinsic; } -defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic; -defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic; - // llvm.amdgcn.smfmac.?32.* vdst, srcA, srcB, srcC, index, cbsz, abid class AMDGPUMSmfmacIntrinsic : ClangBuiltin, @@ -3093,13 +3084,6 @@ class AMDGPUMSmfmacIntrinsic : [IntrConvergent, IntrNoMem, ImmArg>, ImmArg>]>; -def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic; -def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic; -def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic; -def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic; -def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic; -def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic; - class AMDGPUMFp8SmfmacIntrinsic : AMDGPUMSmfmacIntrinsic; @@ -3108,8 +3092,34 @@ multiclass AMDGPUMFp8SmfmacIntrinsic { def NAME#"_"#kind : AMDGPUMFp8SmfmacIntrinsic; } +// bf16 atomics use v2i16 argument since there is no bf16 data type in the llvm. +def int_amdgcn_global_atomic_fadd_v2bf16 : AMDGPUAtomicRtn; +def int_amdgcn_flat_atomic_fadd_v2bf16 : AMDGPUAtomicRtn; +def int_amdgcn_ds_fadd_v2bf16 : DefaultAttrsIntrinsic< + [llvm_v2i16_ty], + [LLVMQualPointerType<3>, llvm_v2i16_ty], + [IntrArgMemOnly, NoCapture>]>, + ClangBuiltin<"__builtin_amdgcn_ds_atomic_fadd_v2bf16">; + +defset list AMDGPUMFMAIntrinsics940 = { +def int_amdgcn_mfma_i32_16x16x32_i8 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_i32_32x32x16_i8 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_f32_16x16x8_xf32 : AMDGPUMfmaIntrinsic; +def int_amdgcn_mfma_f32_32x32x4_xf32 : AMDGPUMfmaIntrinsic; + +defm int_amdgcn_mfma_f32_16x16x32 : AMDGPUMFp8MfmaIntrinsic; +defm int_amdgcn_mfma_f32_32x32x16 : AMDGPUMFp8MfmaIntrinsic; + +def int_amdgcn_smfmac_f32_16x16x32_f16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_f32_32x32x16_f16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_f32_16x16x32_bf16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_f32_32x32x16_bf16 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_i32_16x16x64_i8 : AMDGPUMSmfmacIntrinsic; +def int_amdgcn_smfmac_i32_32x32x32_i8 : AMDGPUMSmfmacIntrinsic; + defm int_amdgcn_smfmac_f32_16x16x64 : AMDGPUMFp8SmfmacIntrinsic; defm int_amdgcn_smfmac_f32_32x32x32 : AMDGPUMFp8SmfmacIntrinsic; +} // llvm.amdgcn.cvt.f32.bf8 float vdst, int srcA, imm byte_sel [0..3] // byte_sel selects byte from srcA. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index bb1c6b7337299..8eb46a9801482 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -354,82 +354,17 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; -def : SourceOfDivergence; +foreach intr = AMDGPUMFMAIntrinsics908 in +def : SourceOfDivergence; +foreach intr = AMDGPUMFMAIntrinsics90A in +def : SourceOfDivergence; +foreach intr = AMDGPUMFMAIntrinsics940 in +def : SourceOfDivergence; +foreach intr = AMDGPUWMMAIntrinsicsGFX11 in +def : SourceOfDivergence; +foreach intr = AMDGPUWMMAIntrinsicsGFX12 in +def : SourceOfDivergence; + def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective,