ValueTracking/AMDGPU: handle mbcnt in computeKnownBitsFromOperator#183229
Conversation
This helps canonicalize some address calculations, which in turn helps the backend fold immediates into memory load instructions. The operand-order changes in v_mad_u32_u24 occur only because @llvm.amdgcn.mul.u24.i32 is now used in codegen prepare after this change; they do not change anything important.
|
@llvm/pr-subscribers-llvm-analysis @llvm/pr-subscribers-backend-amdgpu Author: Ruiling, Song (ruiling) Changes: This helps canonicalize some address calculations, which in turn helps the backend fold immediates into memory load instructions. The operand-order changes in v_mad_u32_u24 occur only because @llvm.amdgcn.mul.u24.i32 is now used in codegen prepare after this change; they do not change anything important. Patch is 44.10 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/183229.diff 10 Files Affected:
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index 238d6e936382c..422127a250ce1 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -2265,6 +2265,16 @@ static void computeKnownBitsFromOperator(const Operator *I,
Known.Zero.setBitsFrom(KnownZeroFirstBit);
break;
}
+ case Intrinsic::amdgcn_mbcnt_hi:
+ case Intrinsic::amdgcn_mbcnt_lo: {
+ // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at
+ // most 31 + src1.
+ Known.Zero.setBitsFrom(
+ II->getIntrinsicID() == Intrinsic::amdgcn_mbcnt_lo ? 6 : 5);
+ computeKnownBits(I->getOperand(1), Known2, Q, Depth + 1);
+ Known = KnownBits::add(Known, Known2);
+ break;
+ }
case Intrinsic::vscale: {
if (!II->getParent() || !II->getFunction())
break;
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
index d142f4f734890..bbdec31772127 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll
@@ -40,7 +40,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX6-NEXT: s_mov_b32 s2, -1
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_readfirstlane_b32 s4, v1
-; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX6-NEXT: v_mad_u32_u24 v0, 5, v0, s4
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
@@ -66,7 +66,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2
+; GFX8-NEXT: v_mad_u32_u24 v2, 5, v0, s2
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mov_b32_e32 v0, s0
; GFX8-NEXT: v_mov_b32_e32 v1, s1
@@ -95,7 +95,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_waitcnt vmcnt(0)
; GFX9-NEXT: v_readfirstlane_b32 s2, v1
; GFX9-NEXT: v_mov_b32_e32 v2, 0
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX9-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: global_store_dword v2, v0, s[0:1]
; GFX9-NEXT: s_endpgm
@@ -123,7 +123,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W64-NEXT: s_waitcnt vmcnt(0)
; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W64-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX10W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W64-NEXT: s_endpgm
@@ -150,7 +150,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX10W32-NEXT: s_waitcnt vmcnt(0)
; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX10W32-NEXT: v_mov_b32_e32 v1, 0
-; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX10W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX10W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1]
; GFX10W32-NEXT: s_endpgm
@@ -180,7 +180,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W64-NEXT: v_mov_b32_e32 v1, 0
; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX11W64-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W64-NEXT: s_endpgm
@@ -209,7 +209,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX11W32-NEXT: v_mov_b32_e32 v1, 0
; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX11W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX11W32-NEXT: s_waitcnt lgkmcnt(0)
; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11W32-NEXT: s_endpgm
@@ -241,7 +241,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W64-NEXT: v_mov_b32_e32 v1, 0
; GFX12W64-NEXT: s_wait_alu depctr_va_sdst(0)
; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX12W64-NEXT: s_wait_kmcnt 0x0
; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W64-NEXT: s_endpgm
@@ -271,7 +271,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1
; GFX12W32-NEXT: v_mov_b32_e32 v1, 0
; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2)
-; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX12W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX12W32-NEXT: s_wait_kmcnt 0x0
; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX12W32-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
index 6f9ab39462c43..fbc8b812d96c9 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll
@@ -59,7 +59,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT: v_mad_u32_u24 v0, 5, v0, s4
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
;
@@ -91,7 +91,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: v_mad_u32_u24 v0, 5, v0, s4
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -123,7 +123,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: v_mad_u32_u24 v0, 5, v0, s4
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
;
@@ -156,7 +156,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1064-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1064-NEXT: s_endpgm
@@ -189,7 +189,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1032-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX1032-NEXT: s_endpgm
@@ -223,7 +223,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1164-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1164-NEXT: s_endpgm
@@ -256,7 +256,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1132-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
; GFX1132-NEXT: s_endpgm
@@ -291,7 +291,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: s_wait_kmcnt 0x0
; GFX1264-NEXT: v_readfirstlane_b32 s2, v1
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1264-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX1264-NEXT: s_mov_b32 s2, -1
; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1264-NEXT: s_endpgm
@@ -323,7 +323,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: s_wait_kmcnt 0x0
; GFX1232-NEXT: v_readfirstlane_b32 s2, v1
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1232-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null
; GFX1232-NEXT: s_endpgm
@@ -1630,15 +1630,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX8-NEXT: buffer_wbinvl1_vol
; GFX8-NEXT: .LBB3_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
+; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_readfirstlane_b32 s3, v0
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: s_nop 2
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -1672,7 +1673,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX9-NEXT: v_readfirstlane_b32 s3, v0
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], 5, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_nop 2
@@ -1709,7 +1710,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], 5, v2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1744,7 +1745,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, 5, v2, s[2:3]
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
@@ -1781,7 +1782,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, 5, v2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -1816,7 +1817,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1132-NEXT: v_readfirstlane_b32 s3, v1
; GFX1132-NEXT: v_readfirstlane_b32 s2, v0
; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
+; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, 5, v2, s[2:3]
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0
@@ -1854,7 +1855,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1264-NEXT: v_readfirstlane_b32 s3, v1
; GFX1264-NEXT: v_readfirstlane_b32 s2, v0
; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
+; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, 5, v2, s[2:3]
; GFX1264-NEXT: s_mov_b32 s3, 0x31016000
; GFX1264-NEXT: s_mov_b32 s2, -1
; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
@@ -1888,7 +1889,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace
; GFX1232-NEXT: v_readfirstlane_b32 s3, v1
; GFX1232-NEXT: v_readfirstlane_b32 s2, v0
; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3]
+; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, 5, v2, s[2:3]
; GFX1232-NEXT: s_mov_b32 s3, 0x31016000
; GFX1232-NEXT: s_mov_b32 s2, -1
; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
index b71577385606a..95908b2b666cf 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll
@@ -45,7 +45,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
; GFX7LESS-NEXT: s_mov_b32 s2, -1
; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1
-; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX7LESS-NEXT: v_mad_u32_u24 v0, 5, v0, s4
; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX7LESS-NEXT: s_endpgm
@@ -73,7 +73,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: v_readfirstlane_b32 s4, v1
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
-; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX8-NEXT: v_mad_u32_u24 v0, 5, v0, s4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX8-NEXT: s_endpgm
@@ -100,7 +100,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: v_readfirstlane_b32 s4, v1
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
-; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4
+; GFX9-NEXT: v_mad_u32_u24 v0, 5, v0, s4
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX9-NEXT: s_endpgm
@@ -128,7 +128,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s2, v1
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1064-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -156,7 +156,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s2, v1
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1032-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0
@@ -186,7 +186,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1164-NEXT: v_readfirstlane_b32 s2, v1
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1164-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -214,7 +214,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) {
; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
; GFX1132-NEXT: v_readfirstlane_b32 s2, v1
; GFX1132-NEXT: s_mov_b32 s3, 0x31016000
-; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2
+; GFX1132-NEXT: v_mad_u32_u24 v0, 5, v0, s2
; GFX1132-NEXT: s_mov_b32 s2, -1
; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0
@@ -1499,16 +1499,17 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: .LBB4_2:
; GFX8-NEXT: s_or_b64 exec, exec, s[0:1]
-; GFX8-NEXT: v_readfirstlane_b32 s2, v1
-; GFX8-NEXT: v_readfirstlane_b32 s3, v0
; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
-; GFX8-NEXT: v_mov_b32_e32 v0, s3
-; GFX8-NEXT: v_mov_b32_e32 v1, s2
-; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
+; GFX8-NEXT: v_readfirstlane_b32 s4, v1
+; GFX8-NEXT: v_readfirstlane_b32 s5, v0
+; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2
+; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2
+; GFX8-NEXT: v_mov_b32_e32 v2, s4
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v0
; GFX8-NEXT: s_mov_b32 s3, 0xf000
; GFX8-NEXT: s_mov_b32 s2, -1
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
-; GFX8-NEXT: s_nop 1
; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0
; GFX8-NEXT: s_endpgm
;
@@ -1535,7 +1536,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX9-NEXT: v_mov_b32_e32 v0, s3
; GFX9-NEXT: v_mov_b32_e32 v1, s2
-; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1]
+; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], 5, v2, v[0:1]
; GFX9-NEXT: s_mov_b32 s3, 0xf000
; GFX9-NEXT: s_mov_b32 s2, -1
; GFX9-NEXT: s_waitcnt lgkmcnt(0)
@@ -1566,7 +1567,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1064-NEXT: v_readfirstlane_b32 s3, v1
; GFX1064-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3]
+; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], 5, v2, s[2:3]
; GFX1064-NEXT: s_mov_b32 s3, 0x31016000
; GFX1064-NEXT: s_mov_b32 s2, -1
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
@@ -1595,7 +1596,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24
; GFX1032-NEXT: v_readfirstlane_b32 s3, v1
; GFX1032-NEXT: v_readfirstlane_b32 s2, v0
-; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3]
+; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, 5, v2, s[2:3]
; GFX1032-NEXT: s_mov_b32 s3, 0x31016000
; GFX1032-NEXT: s_mov_b32 s2, -1
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
@@ -1627,7 +1628,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; GFX1164-NEXT: v_readfirstlane_b32 s3, v1
; GFX1164-NEXT: v_readfirstlane_b32 s2, v0
; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3]
+; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, 5, v2, s[2:3]
; GFX1164-NEXT: s_mov_b32 s3, 0x31016000
; GFX1164-NEXT: s_mov_b32 s2, -1
; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
@@ -1658,7 +1659,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) {
; G...
[truncated]
|
| ; CHECK-W64-NEXT: [[MASKED:%.*]] = and i32 [[TID1]], 65280 | ||
| ; CHECK-W64-NEXT: [[OR_RES:%.*]] = or disjoint i32 [[MASKED]], 55 | ||
| ; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]]) | ||
| ; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 55) |
There was a problem hiding this comment.
The test check changes seem reasonable. The `and` with 65280 masks off the bits that come from mbcnt, so only the bits of the constant 55 are kept.
| ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 | ||
| ; GFX8-NEXT: v_readfirstlane_b32 s5, v0 | ||
| ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 | ||
| ; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 | ||
| ; GFX8-NEXT: v_mov_b32_e32 v2, s4 | ||
| ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v0 |
There was a problem hiding this comment.
Is this code faster because it is using _u24 instructions, even though it is more instructions?
There was a problem hiding this comment.
I am not sure about this, but it looks like the issue is very specific to gfx8; newer hardware all works well. I think v_mad_u64 is not as fast as the 32-bit version on gfx8.
Are there any tests that show an improvement from this? |
add a more direct test. test wave32/64.
I agree it would be better to still have a test that shows the benefit. I have changed #178607 to cover this. The test check of |
🐧 Linux x64 Test Results
✅ The build succeeded and all tests passed. |
🪟 Windows x64 Test Results
✅ The build succeeded and all tests passed. |
| @@ -0,0 +1,38 @@ | |||
| ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 | |||
| ; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck --check-prefixes=WAVE32,CHECK %s | |||
There was a problem hiding this comment.
You need to reorder the check prefixes so that CHECK is first
| ; RUN: opt -passes=instcombine < %s | llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GFX1250,GFX1250-GISEL %s | ||
| ; RUN: opt -passes=instcombine < %s | llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s | ||
| ; RUN: opt -passes=instcombine < %s | llc -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1250 | FileCheck -check-prefixes=GFX1250,GFX1250-SDAG %s | ||
| ; RUN: opt -passes=instcombine < %s | llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s |
There was a problem hiding this comment.
Sorry, I forgot to remove the `< %s` from the llc input, so llc reads the original file instead of the output of opt and the test does not work exactly as intended. It still acts as a useful end-to-end regression test.
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/186/builds/16697 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/72/builds/17412 Here is the relevant piece of the build log for the reference |
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/168/builds/18225 Here is the relevant piece of the build log for the reference |
…lvm#183229) This helps canonicalize some address calculation. This would further help immediate folding into memory load instructions in the backend. The order changes to v_mad_u32_u24 is just because @llvm.amdgcn.mul.u24.i32 was used in codegen prepare after this change. It does not really change anything important.
…lvm#183229) This helps canonicalize some address calculation. This would further help immediate folding into memory load instructions in the backend. The order changes to v_mad_u32_u24 is just because @llvm.amdgcn.mul.u24.i32 was used in codegen prepare after this change. It does not really change anything important.
Changes: - Renamed MLIR pass registration functions to include `Pass` suffix (`registerCanonicalizerPass`, `registerStripDebugInfoPass`, etc.) following upstream [785490e9db54](llvm/llvm-project@785490e) ([#183950](llvm/llvm-project#183950)) - Updated `llvm-lldb-exports.patch` for upstream LLDBLog refactoring: - `InitializeLldbChannel` renamed to `InitializeLLDBChannel`/`TerminateLLDBChannel` ([d4d18248](llvm/llvm-project@d4d18248fde6)) - Functions then wrapped in `LLDBLogChannel` class ([45dbce3a](llvm/llvm-project@45dbce3a3a3e)) - Fix `executeFromDriver` to accept non-const `SmallVectorImpl` after `MutableArrayRef` stopped accepting const container sources ([c88ba88d](llvm/llvm-project@c88ba88da52b)) - Fix MGP dialect tests: add return types to `func.func` declarations now that MLIR's `FuncOp` verifier checks `ReturnLike` ops (including `mef.output`) against declared return types, following upstream [b28ec5ad1808](llvm/llvm-project@b28ec5a) ([#184612](llvm/llvm-project#184612)) - Fix GCN FileCheck patterns for AMDGPU `@llvm.amdgcn.implicitarg.ptr()` now emitting `dereferenceable(256)` return attribute, following upstream [fdc4a982f5d6](llvm/llvm-project@fdc4a982f5d6) ([#182206](llvm/llvm-project#182206)) - Update GCN shuffle test FileCheck patterns for LLVM optimizer improvements: `and`-mask simplifications (e.g. 
`and i32 %5, -64` → `and i32 %5, 64`) and `add nuw nsw` flag additions, enabled by upstream KnownBits analysis for `mbcnt_lo`/`mbcnt_hi` intrinsics in [686987a540bc](llvm/llvm-project@686987a) ([#183229](llvm/llvm-project#183229)) - Fix GEP element type FileCheck patterns: InstCombine now canonicalizes GEP element types to byte arrays (`float` → `[4 x i8]`, `i64` → `[8 x i8]`) - Fix `mo.while` body type mismatch error message: MLIR's `RegionBranchOpInterface` (`verifyWithRegions`) now detects yield type mismatches before `LoopLikeOpInterface`, emitting a different diagnostic, following upstream [b28ec5ad1808](llvm/llvm-project@b28ec5a) - Regenerate MLIR Python binding stubs MODULAR_ORIG_COMMIT_REV_ID: acd27a457aa9115fd91f73d7ba3338365ece3eb4
This helps canonicalize some address calculations, which in turn helps the backend fold immediates into memory load instructions.
The operand-order changes in v_mad_u32_u24 occur only because @llvm.amdgcn.mul.u24.i32 is now used in codegen prepare after this change; they do not change anything important.