[AMDGPU] Add GFX12 WMMA and SWMMAC instructions #77795

mbrkusanin · 2024-01-11T16:27:28Z

No description provided.

mbrkusanin · 2024-01-11T16:27:37Z

~~Note that the first commit in this PR is: #77785~~ (merged and removed from this PR)

llvmbot · 2024-01-11T16:27:57Z

@llvm/pr-subscribers-mlir-llvm
@llvm/pr-subscribers-mlir
@llvm/pr-subscribers-llvm-globalisel
@llvm/pr-subscribers-llvm-ir
@llvm/pr-subscribers-backend-amdgpu

@llvm/pr-subscribers-clang-codegen

Author: Mirko Brkušanin (mbrkusanin)

Changes

Patch is 1002.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77795.diff

62 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+61)
(modified) clang/lib/CodeGen/CGBuiltin.cpp (+159-13)
(added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl (+156)
(added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl (+155)
(added) clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl (+135)
(added) clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl (+134)
(modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl (+8-9)
(modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl (+8-10)
(added) cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl (+107)
(added) cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl (+104)
(added) cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl (+110)
(added) cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl (+109)
(modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+93-28)
(modified) llvm/lib/Target/AMDGPU/AMDGPUGISel.td (+27-3)
(modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+331-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+11-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+214-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+14-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+23)
(modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+16)
(modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+16)
(modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+141-5)
(modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+8)
(modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+5-3)
(modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+37)
(modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h (+4)
(modified) llvm/lib/Target/AMDGPU/SIDefines.h (+3)
(modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+1)
(modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31-1)
(modified) llvm/lib/Target/AMDGPU/SIInstrFormats.td (+5)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+8)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+12-1)
(modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+5)
(modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+498-10)
(modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+3)
(modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+118-18)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll (+504)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll (+519)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll (+309)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll (+321)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll (+370)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll (+459)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll (+430)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll (+274)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll (+472)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll (+333)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll (+499)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll (+431)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll (+309)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll (+321)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll (+370)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll (+456)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll (+373)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll (+274)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll (+472)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll (+333)
(added) llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir (+358)
(added) llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir (+359)
(added) llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s (+1529)
(added) llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s (+1529)
(added) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt (+1628)
(added) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt (+1628)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..026c0af65c92bb 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -423,6 +423,67 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
 
+//===----------------------------------------------------------------------===//
+// WMMA builtins.
+// Postfix w32 indicates the builtin requires wavefront size of 32.
+// Postfix w64 indicates the builtin requires wavefront size of 64.
+//
+// Some of these are very similar to their GFX11 counterparts, but they don't
+// require replication of the A,B matrices, so they use fewer vector elements.
+// Therefore, we add an "_gfx12" suffix to distinguish them from the existing
+// builtins.
+//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+// These are gfx12-only, but for consistency with the other WMMA variants we're
+// keeping the "_gfx12" suffix.
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+// These are gfx12-only, but for consistency with the other WMMA variants we're
+// keeping the "_gfx12" suffix.
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
 
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 998fcc3af58175..c588b32f698bf5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18240,65 +18240,211 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
-  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: {
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
 
     // These operations perform a matrix multiplication and accumulation of
     // the form:
     //             D = A * B + C
-    // The return type always matches the type of matrix C.
-    unsigned ArgForMatchingRetType;
+    // We need to specify one type for matrices AB and one for matrices CD.
+    SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
+    // Some intrinsics expect "false" as an extra bool argument.
+    bool AppendExtraBoolArg = false;
     unsigned BuiltinWMMAOp;
 
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
-      ArgForMatchingRetType = 2;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
       break;
     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
-      ArgForMatchingRetType = 2;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
       break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
+      AppendExtraBoolArg = true;
+      LLVM_FALLTHROUGH;
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
-      ArgForMatchingRetType = 2;
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
       break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
+      AppendExtraBoolArg = true;
+      LLVM_FALLTHROUGH;
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
-      ArgForMatchingRetType = 2;
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
       break;
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
-      ArgForMatchingRetType = 2;
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
       break;
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
-      ArgForMatchingRetType = 2;
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
       break;
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
-      ArgForMatchingRetType = 4;
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {1, 4};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
       break;
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
-      ArgForMatchingRetType = 4;
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {1, 4};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
       break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {1, 4};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
+      ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
+      ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
+      ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
+      break;
     }
 
     SmallVector<Value *, 6> Args;
     for (int i = 0, e = E->getNumArgs(); i != e; ++i)
       Args.push_back(EmitScalarExpr(E->getArg(i)));
+    if (AppendExtraBoolArg)
+      Args.push_back(Builder.getFalse());
 
-    Function *F = CGM.getIntrinsic(BuiltinWMMAOp,
-                                   {Args[ArgForMatchingRetType]->getType()});
+    SmallVector<llvm::Type *, 6> ArgTypes;
+    for (auto ArgIdx : ArgsForMatchingMatrixTypes)
+      ArgTypes.push_back(Args[ArgIdx]->getType());
 
+    Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
     return Builder.CreateCall(F, Args);
   }
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
new file mode 100644
index 00000000000000..a5d8bb34a7842d
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
@@ -0,0 +1,156 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef int    v2i   __attribute__((ext_vector_type(2)));
+typedef float  v8f   __attribute__((ext_vector_type(8)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef int    v8i   __attribute__((ext_vector_type(8)));
+
+// Wave32
+
+//
+// amdgcn_wmma_f32_16x16x16_f16
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32(
+// CHECK-GFX1200-NEXT:  entry:
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-GFX1200-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
+}
+
+//
+// amdgcn_wmma_f...
[truncated]

llvmbot · 2024-01-11T16:27:57Z

@llvm/pr-subscribers-mc

Author: Mirko Brkušanin (mbrkusanin)

Changes

Patch is 1002.09 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/77795.diff

62 Files Affected:

(modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+61)
(modified) clang/lib/CodeGen/CGBuiltin.cpp (+159-13)
(added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl (+156)
(added) clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl (+155)
(added) clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl (+135)
(added) clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl (+134)
(modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl (+8-9)
(modified) clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl (+8-10)
(added) cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl (+107)
(added) cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl (+104)
(added) cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl (+110)
(added) cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl (+109)
(modified) llvm/include/llvm/IR/IntrinsicsAMDGPU.td (+93-28)
(modified) llvm/lib/Target/AMDGPU/AMDGPUGISel.td (+27-3)
(modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp (+331-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h (+11-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp (+214-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h (+14-1)
(modified) llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp (+23)
(modified) llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp (+16)
(modified) llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td (+16)
(modified) llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp (+141-5)
(modified) llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp (+8)
(modified) llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp (+5-3)
(modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp (+37)
(modified) llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h (+4)
(modified) llvm/lib/Target/AMDGPU/SIDefines.h (+3)
(modified) llvm/lib/Target/AMDGPU/SIFoldOperands.cpp (+1)
(modified) llvm/lib/Target/AMDGPU/SIISelLowering.cpp (+31-1)
(modified) llvm/lib/Target/AMDGPU/SIInstrFormats.td (+5)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.h (+8)
(modified) llvm/lib/Target/AMDGPU/SIInstrInfo.td (+12-1)
(modified) llvm/lib/Target/AMDGPU/SIRegisterInfo.td (+5)
(modified) llvm/lib/Target/AMDGPU/VOP3PInstructions.td (+498-10)
(modified) llvm/lib/Target/AMDGPU/VOPInstructions.td (+3)
(modified) llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll (+118-18)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll (+504)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll (+519)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll (+309)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll (+321)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll (+370)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll (+459)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll (+430)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll (+274)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll (+472)
(added) llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll (+333)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll (+499)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll (+431)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll (+309)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll (+321)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll (+370)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll (+456)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll (+373)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll (+274)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll (+472)
(added) llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll (+333)
(added) llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir (+358)
(added) llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir (+359)
(added) llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s (+1529)
(added) llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s (+1529)
(added) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt (+1628)
(added) llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt (+1628)

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index e562ef04a30194..026c0af65c92bb 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -423,6 +423,67 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wakeup_barrier, "vi", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_barrier_leave, "b", "n", "gfx12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_s_get_barrier_state, "Uii", "n", "gfx12-insts")
 
+//===----------------------------------------------------------------------===//
+// WMMA builtins.
+// Postfix w32 indicates the builtin requires wavefront size of 32.
+// Postfix w64 indicates the builtin requires wavefront size of 64.
+//
+// Some of these are very similar to their GFX11 counterparts, but they don't
+// require replication of the A,B matrices, so they use fewer vector elements.
+// Therefore, we add an "_gfx12" suffix to distinguish them from the existing
+// builtins.
+//===----------------------------------------------------------------------===//
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+// These are gfx12-only, but for consistency with the other WMMA variants we're
+// keeping the "_gfx12" suffix.
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+// These are gfx12-only, but for consistency with the other WMMA variants we're
+// keeping the "_gfx12" suffix.
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
+
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64")
 
 #undef BUILTIN
 #undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index 998fcc3af58175..c588b32f698bf5 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -18240,65 +18240,211 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
   case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
-  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: {
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
+  case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
 
     // These operations perform a matrix multiplication and accumulation of
     // the form:
     //             D = A * B + C
-    // The return type always matches the type of matrix C.
-    unsigned ArgForMatchingRetType;
+    // We need to specify one type for matrices AB and one for matrices CD.
+    SmallVector<unsigned, 2> ArgsForMatchingMatrixTypes;
+    // Some intrinsics expect "false" as an extra bool argument.
+    bool AppendExtraBoolArg = false;
     unsigned BuiltinWMMAOp;
 
     switch (BuiltinID) {
     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64:
-      ArgForMatchingRetType = 2;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16;
       break;
     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64:
-      ArgForMatchingRetType = 2;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16;
       break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12:
+      AppendExtraBoolArg = true;
+      LLVM_FALLTHROUGH;
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64:
-      ArgForMatchingRetType = 2;
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16;
       break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12:
+      AppendExtraBoolArg = true;
+      LLVM_FALLTHROUGH;
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64:
-      ArgForMatchingRetType = 2;
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16;
       break;
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64:
-      ArgForMatchingRetType = 2;
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied;
       break;
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64:
-      ArgForMatchingRetType = 2;
+      ArgsForMatchingMatrixTypes = {0, 2};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied;
       break;
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64:
-      ArgForMatchingRetType = 4;
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {1, 4};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8;
       break;
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32:
     case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64:
-      ArgForMatchingRetType = 4;
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {1, 4};
       BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4;
       break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {0, 2};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12:
+    case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12:
+      ArgsForMatchingMatrixTypes = {1, 4};
+      BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64:
+      ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64:
+      ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64:
+      ArgsForMatchingMatrixTypes = {1, 3, 4, 5};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
+    case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
+      ArgsForMatchingMatrixTypes = {0, 1, 2, 3};
+      BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
+      break;
     }
 
     SmallVector<Value *, 6> Args;
     for (int i = 0, e = E->getNumArgs(); i != e; ++i)
       Args.push_back(EmitScalarExpr(E->getArg(i)));
+    if (AppendExtraBoolArg)
+      Args.push_back(Builder.getFalse());
 
-    Function *F = CGM.getIntrinsic(BuiltinWMMAOp,
-                                   {Args[ArgForMatchingRetType]->getType()});
+    SmallVector<llvm::Type *, 6> ArgTypes;
+    for (auto ArgIdx : ArgsForMatchingMatrixTypes)
+      ArgTypes.push_back(Args[ArgIdx]->getType());
 
+    Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes);
     return Builder.CreateCall(F, Args);
   }
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
new file mode 100644
index 00000000000000..a5d8bb34a7842d
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl
@@ -0,0 +1,156 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+typedef int    v2i   __attribute__((ext_vector_type(2)));
+typedef float  v8f   __attribute__((ext_vector_type(8)));
+typedef half   v8h   __attribute__((ext_vector_type(8)));
+typedef short  v8s   __attribute__((ext_vector_type(8)));
+typedef int    v8i   __attribute__((ext_vector_type(8)));
+
+// Wave32
+
+//
+// amdgcn_wmma_f32_16x16x16_f16
+//
+
+// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32(
+// CHECK-GFX1200-NEXT:  entry:
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]])
+// CHECK-GFX1200-NEXT:    store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-GFX1200-NEXT:    ret void
+//
+void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c)
+{
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
+}
+
+//
+// amdgcn_wmma_f...
[truncated]

arsenm · 2024-01-12T07:27:19Z

clang/include/clang/Basic/BuiltinsAMDGPU.def

+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32")


the signed-unsigned fusion with control bit is a bit ugly, would have been nicer to have separate signed and unsigned variants with the types changed. I suppose this was already the mistake made with the gfx11 builtins though

clang/lib/CodeGen/CGBuiltin.cpp

arsenm · 2024-01-17T07:14:16Z

clang/include/clang/Basic/BuiltinsAMDGPU.def

+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
+
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64")


Can we switch new bf16 types to use the natural __bf16?

Updated to bfloat but GlobalISel does not handle it properly yet. Should we use i16 for now until we update GlobalISel?

jayfoad · 2024-01-19T11:17:59Z

Some of the tests in this patch need regenerating now that #77438 has been merged.

mbrkusanin · 2024-01-19T19:35:47Z

Rebased.

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td

mbrkusanin · 2024-01-22T17:41:45Z

If there are no further comments, should I merge this?

mbrkusanin · 2024-01-23T11:53:24Z

Ping

arsenm · 2024-01-23T15:08:22Z

llvm/include/llvm/IR/IntrinsicsAMDGPU.td

+//        The content of the other 16-bit half is undefined.
+// GFX12: The op_sel bit must be 0.
+def int_amdgcn_wmma_f16_16x16x16_f16   : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_any_ty, llvm_any_ty>;


Why is this using any_Ty? Should just be the one?

Sizes are halved. GFX11 basically contained same matrix twice.

This is how intrinsics look like at the moment:
gfx11:
declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg)
gfx12:
declare <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x16.bf16(<8 x bfloat>, <8 x bfloat>, <8 x bfloat>, i1 immarg)

This is kind of horrible. It's not at all clear you're supposed to use one type for one target and a different one for another. I wonder if they should just be renamed?

I suggest we take a step back, and push the previous version of the patch where the bf16 intrinsics used i16 for consistency with gfx11.

Then, in a follow-up commit we will add new bf16 intrinsics with the proper bfloat type (I realize the naming could be contentious, but we could discuss it in the follow-up review).

mbrkusanin · 2024-01-24T10:19:56Z

Rebased and reverted bfloat

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>

This reverts commit f825890.

mbrkusanin · 2024-01-24T11:56:19Z

Rebased and updated after #76143

anlunx · 2024-01-25T00:58:36Z

Also need to be updated:

llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td

Line 14 in bb6a485

def SMEMOffsetMod : NamedIntOperand<i32, "offset", 0>;

jayfoad · 2024-01-25T07:40:05Z

Also need to be updated:

llvm-project/llvm/lib/Target/AMDGPU/SMInstructions.td

Line 14 in bb6a485

def SMEMOffsetMod : NamedIntOperand<i32, "offset", 0>;

What needs to be updated and why?

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com> Change-Id: I6ab1132823033fb047665f3a527cff748ff69589

llvmbot added clang Clang issues not falling into any other category backend:AMDGPU clang:frontend Language frontend issues, e.g. anything involving "Sema" clang:codegen mc Machine (object) code llvm:globalisel llvm:ir llvm:analysis labels Jan 11, 2024

arsenm reviewed Jan 12, 2024

View reviewed changes

piotrAMD reviewed Jan 12, 2024

View reviewed changes

clang/lib/CodeGen/CGBuiltin.cpp Outdated Show resolved Hide resolved

mbrkusanin force-pushed the gfx12-wmma-swmmac branch from b83f246 to ddb56d9 Compare January 12, 2024 10:26

arsenm reviewed Jan 17, 2024

View reviewed changes

mbrkusanin force-pushed the gfx12-wmma-swmmac branch from ddb56d9 to ca8448d Compare January 18, 2024 12:26

mbrkusanin force-pushed the gfx12-wmma-swmmac branch from ca8448d to 86dfbca Compare January 19, 2024 19:34

llvmbot added mlir:llvm mlir labels Jan 19, 2024

mbrkusanin force-pushed the gfx12-wmma-swmmac branch from 86dfbca to 632ad09 Compare January 22, 2024 08:39

krzysz00 reviewed Jan 22, 2024

View reviewed changes

mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td Show resolved Hide resolved

mbrkusanin force-pushed the gfx12-wmma-swmmac branch 2 times, most recently from 46509ac to 732186b Compare January 22, 2024 17:08

arsenm reviewed Jan 23, 2024

View reviewed changes

mbrkusanin force-pushed the gfx12-wmma-swmmac branch from 732186b to 0b31720 Compare January 24, 2024 10:19

piotrAMD approved these changes Jan 24, 2024

View reviewed changes

mbrkusanin and others added 10 commits January 24, 2024 12:46

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions

6541b99

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>

rename AppendExtraBoolArg

4a37160

Use bfloat

d0905f9

Adjust overload types between clang and mlir

50b642f

update tests

cead267

also reorder types for swmmac, dst before src types

d8e9aa3

Restore Traits argument

a9db694

actually use traits

d0ea16e

Revert "Use bfloat"

fd460a7

This reverts commit f825890.

update GCNHazardRecognizer

d3f9c4f

mbrkusanin force-pushed the gfx12-wmma-swmmac branch from 0b31720 to d3f9c4f Compare January 24, 2024 11:55

mbrkusanin merged commit 7fdf608 into llvm:main Jan 24, 2024
3 of 4 checks passed

mbrkusanin deleted the gfx12-wmma-swmmac branch January 24, 2024 12:43

mbrkusanin mentioned this pull request Jan 24, 2024

Backport 7fdf608 to release/18.x #79293

Closed

mbrkusanin added a commit that referenced this pull request Jan 25, 2024

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions (#77795)

eafe7dd

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>

mbrkusanin added a commit to mbrkusanin/llvm-project that referenced this pull request Jan 26, 2024

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions (llvm#77795)

e094398

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>

tstellar pushed a commit that referenced this pull request Jan 27, 2024

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions (#77795)

ed48280

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>

tstellar pushed a commit to tstellar/llvm-project that referenced this pull request Feb 14, 2024

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions (llvm#77795)

1bfcd19

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>

tstellar pushed a commit to tstellar/llvm-project that referenced this pull request Feb 14, 2024

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions (llvm#77795)

b2ffde0

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>

tstellar pushed a commit to tstellar/llvm-project that referenced this pull request Feb 14, 2024

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions (llvm#77795)

0e73ace

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>

tstellar pushed a commit to tstellar/llvm-project that referenced this pull request Feb 14, 2024

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions (llvm#77795)

da0b7ae

Co-authored-by: Petar Avramovic <Petar.Avramovic@amd.com> Co-authored-by: Piotr Sobczak <piotr.sobczak@amd.com>

piotrAMD mentioned this pull request Mar 8, 2024

[AMDGPU] Replace isInlinableLiteral16 with specific version #84402

Merged

pointhex mentioned this pull request May 7, 2024

getStyleDiagHandler #91314

Closed

aemerson mentioned this pull request May 9, 2024

release/18.x: [AArc64][GlobalISel] Fix legalizer assert for G_INSERT_VECTOR_ELT - manual merge #91672

Merged

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions #77795

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions #77795

mbrkusanin commented Jan 11, 2024

mbrkusanin commented Jan 11, 2024 •

edited

llvmbot commented Jan 11, 2024 •

edited

llvmbot commented Jan 11, 2024

arsenm Jan 12, 2024

arsenm Jan 17, 2024

mbrkusanin Jan 18, 2024

jayfoad commented Jan 19, 2024

mbrkusanin commented Jan 19, 2024

mbrkusanin commented Jan 22, 2024

mbrkusanin commented Jan 23, 2024

arsenm Jan 23, 2024

mbrkusanin Jan 23, 2024

arsenm Jan 24, 2024

piotrAMD Jan 24, 2024

mbrkusanin commented Jan 24, 2024

mbrkusanin commented Jan 24, 2024

anlunx commented Jan 25, 2024

jayfoad commented Jan 25, 2024

		TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
		TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32")

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions #77795

[AMDGPU] Add GFX12 WMMA and SWMMAC instructions #77795

Conversation

mbrkusanin commented Jan 11, 2024

mbrkusanin commented Jan 11, 2024 • edited

llvmbot commented Jan 11, 2024 • edited

llvmbot commented Jan 11, 2024

arsenm Jan 12, 2024

Choose a reason for hiding this comment

arsenm Jan 17, 2024

Choose a reason for hiding this comment

mbrkusanin Jan 18, 2024

Choose a reason for hiding this comment

jayfoad commented Jan 19, 2024

mbrkusanin commented Jan 19, 2024

mbrkusanin commented Jan 22, 2024

mbrkusanin commented Jan 23, 2024

arsenm Jan 23, 2024

Choose a reason for hiding this comment

mbrkusanin Jan 23, 2024

Choose a reason for hiding this comment

arsenm Jan 24, 2024

Choose a reason for hiding this comment

piotrAMD Jan 24, 2024

Choose a reason for hiding this comment

mbrkusanin commented Jan 24, 2024

mbrkusanin commented Jan 24, 2024

anlunx commented Jan 25, 2024

jayfoad commented Jan 25, 2024

mbrkusanin commented Jan 11, 2024 •

edited

llvmbot commented Jan 11, 2024 •

edited