diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp index b25619b4c5422..1ff7589339a14 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInsertDelayAlu.cpp @@ -16,6 +16,7 @@ #include "MCTargetDesc/AMDGPUMCTargetDesc.h" #include "SIInstrInfo.h" #include "llvm/ADT/SetVector.h" +#include "llvm/CodeGen/MachineInstr.h" using namespace llvm; @@ -51,12 +52,15 @@ class AMDGPUInsertDelayAlu { enum DelayType { VALU, TRANS, SALU, OTHER }; - // Get the delay type for an instruction with the specified TSFlags. - static DelayType getDelayType(uint64_t TSFlags) { - if (TSFlags & SIInstrFlags::TRANS) + // Get the delay type for the specified instruction. + DelayType getDelayType(const MachineInstr &MI) { + if (SIInstrInfo::isTRANS(MI)) return TRANS; - if (TSFlags & SIInstrFlags::VALU) + // XDL ops are treated the same as TRANS. + if (SII->isXDL(MI)) + return TRANS; + if (SIInstrInfo::isVALU(MI)) return VALU; - if (TSFlags & SIInstrFlags::SALU) + if (SIInstrInfo::isSALU(MI)) return SALU; return OTHER; } @@ -343,7 +347,7 @@ class AMDGPUInsertDelayAlu { continue; } - DelayType Type = getDelayType(MI.getDesc().TSFlags); + DelayType Type = getDelayType(MI); if (instructionWaitsForVALU(MI)) { // Forget about all outstanding VALU delays. 
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ae285d069d876..5095cecc8732f 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -10262,6 +10262,8 @@ bool SIInstrInfo::isGlobalMemoryObject(const MachineInstr *MI) const { bool SIInstrInfo::isXDL(const MachineInstr &MI) const { unsigned Opcode = MI.getOpcode(); + if (AMDGPU::isGFX12(ST)) + return isWMMA(MI) || isSWMMAC(MI) || isDOT(MI); if (!SIInstrInfo::isMAI(MI) || isDGEMM(Opcode) || Opcode == AMDGPU::V_ACCVGPR_WRITE_B32_e64 || diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll index 1e9ef07ba7542..b8394faafbd8a 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll @@ -20,7 +20,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off @@ -64,7 +64,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 ; 
GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off @@ -102,7 +102,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 ; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off @@ -140,7 +140,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 ; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off @@ -184,7 +184,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off @@ -280,7 +280,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, 
<2 x i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off @@ -324,7 +324,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off @@ -368,7 +368,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off @@ -412,7 +412,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 -; GFX12-NEXT: s_delay_alu 
instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll index 311e76b9bb2b0..5d5c30126979f 100644 --- a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll @@ -20,7 +20,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off @@ -64,7 +64,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off @@ -102,7 +102,7 @@ define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 ; GFX12-NEXT: 
v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 ; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 ; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off @@ -140,7 +140,7 @@ define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 ; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 ; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off @@ -184,7 +184,7 @@ define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off @@ -280,7 +280,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], 
v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off @@ -324,7 +324,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off @@ -368,7 +368,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off @@ -412,7 +412,7 @@ define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 ; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off