diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 2364fdddef31d..3f925844a4020 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -2265,6 +2265,16 @@ static void computeKnownBitsFromOperator(const Operator *I, Known.Zero.setBitsFrom(KnownZeroFirstBit); break; } + case Intrinsic::amdgcn_mbcnt_hi: + case Intrinsic::amdgcn_mbcnt_lo: { + // Wave64 mbcnt_lo returns at most 32 + src1. Otherwise these return at + // most 31 + src1. + Known.Zero.setBitsFrom( + II->getIntrinsicID() == Intrinsic::amdgcn_mbcnt_lo ? 6 : 5); + computeKnownBits(I->getOperand(1), Known2, Q, Depth + 1); + Known = KnownBits::add(Known, Known2); + break; + } case Intrinsic::vscale: { if (!II->getParent() || !II->getFunction()) break; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll index d142f4f734890..bbdec31772127 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -40,7 +40,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readfirstlane_b32 s4, v1 -; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX6-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -66,7 +66,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, 5, v0, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -95,7 +95,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -123,7 +123,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX10W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm @@ -150,7 +150,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX10W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm @@ -180,7 +180,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX11W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_endpgm @@ -209,7 +209,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX11W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_endpgm @@ -241,7 +241,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX12W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_endpgm @@ -271,7 +271,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX12W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll index 6f9ab39462c43..fbc8b812d96c9 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -59,7 +59,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX7LESS-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -91,7 +91,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX8-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -123,7 +123,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX9-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -156,7 +156,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1064-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm @@ -189,7 +189,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1032-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm @@ -223,7 +223,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1164-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1164-NEXT: s_endpgm @@ -256,7 +256,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1132-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 ; GFX1132-NEXT: s_endpgm @@ -291,7 +291,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: s_wait_kmcnt 0x0 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1264-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1264-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1264-NEXT: s_endpgm @@ -323,7 +323,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: s_wait_kmcnt 0x0 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1232-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1232-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b32 v0, off, s[0:3], null ; GFX1232-NEXT: s_endpgm @@ -1630,15 +1630,16 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB3_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: v_readfirstlane_b32 s5, v0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s2, v1 -; GFX8-NEXT: v_readfirstlane_b32 s3, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 2 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1672,7 +1673,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_readfirstlane_b32 s3, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], 5, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_nop 2 @@ -1709,7 +1710,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], 5, v2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1744,7 +1745,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, 5, v2, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1781,7 +1782,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, 5, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -1816,7 +1817,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, 5, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 @@ -1854,7 +1855,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1264-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1264-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1264-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1264-NEXT: v_mad_co_u64_u32 v[0:1], null, 5, v2, s[2:3] ; GFX1264-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1264-NEXT: s_mov_b32 s2, -1 ; GFX1264-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null @@ -1888,7 +1889,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out, ptr addrspace ; GFX1232-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1232-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1232-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1232-NEXT: v_mad_co_u64_u32 v[0:1], null, 5, v2, s[2:3] ; GFX1232-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1232-NEXT: s_mov_b32 s2, -1 ; GFX1232-NEXT: buffer_store_b64 v[0:1], off, s[0:3], null diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll index b71577385606a..95908b2b666cf 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -45,7 +45,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX7LESS-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm @@ -73,7 +73,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX8-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -100,7 +100,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX9-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -128,7 +128,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1064-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -156,7 +156,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1032-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -186,7 +186,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1164-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -214,7 +214,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX1132-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -1499,16 +1499,17 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX8-NEXT: v_readfirstlane_b32 s2, v1 -; GFX8-NEXT: v_readfirstlane_b32 s3, v0 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_mov_b32_e32 v0, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] +; GFX8-NEXT: v_readfirstlane_b32 s4, v1 +; GFX8-NEXT: v_readfirstlane_b32 s5, v0 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 +; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s5, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1535,7 +1536,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v0, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 -; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, v[0:1] +; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], 5, v2, v[0:1] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1566,7 +1567,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v2, 5, s[2:3] +; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], 5, v2, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) @@ -1595,7 +1596,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] +; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, 5, v2, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) @@ -1627,7 +1628,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1164-NEXT: v_mad_u64_u32 v[0:1], null, 5, v2, s[2:3] ; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) @@ -1658,7 +1659,7 @@ define amdgpu_kernel void @add_i64_constant(ptr addrspace(1) %out) { ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, v2, 5, s[2:3] +; GFX1132-NEXT: v_mad_u64_u32 v[0:1], null, 5, v2, s[2:3] ; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 40a271d08abb5..e7ade6614795c 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -37,7 +37,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX7-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX7-NEXT: .LBB0_4: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_wqm_b64 s[4:5], -1 @@ -72,7 +72,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX89-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: v_readfirstlane_b32 s4, v1 -; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX89-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX89-NEXT: .LBB0_4: ; %Flow ; GFX89-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX89-NEXT: s_wqm_b64 s[4:5], -1 @@ -108,7 +108,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1064-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX1064-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX1064-NEXT: .LBB0_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 @@ -143,7 +143,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX1032-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX1032-NEXT: .LBB0_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 @@ -182,7 +182,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1164-NEXT: s_waitcnt vmcnt(0) ; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX1164-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX1164-NEXT: .LBB0_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 @@ -221,7 +221,7 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1132-NEXT: s_waitcnt vmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1132-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX1132-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX1132-NEXT: .LBB0_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132-NEXT: s_wqm_b32 s4, -1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll index 42c6e2d7f7ae1..70a2e396cfa06 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -39,7 +39,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readfirstlane_b32 s4, v1 -; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX6-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -65,7 +65,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, 5, v0, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -94,7 +94,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -122,7 +122,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX10W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm @@ -149,7 +149,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX10W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm @@ -179,7 +179,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX11W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_endpgm @@ -208,7 +208,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX11W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_endpgm @@ -240,7 +240,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX12W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_endpgm @@ -270,7 +270,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX12W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll index c886146b94465..962590b2f7ce3 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -40,7 +40,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: v_readfirstlane_b32 s4, v1 -; GFX6-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX6-NEXT: v_mad_u32_u24 v0, 5, v0, s4 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -67,7 +67,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 -; GFX8-NEXT: v_mad_u32_u24 v2, v0, 5, s2 +; GFX8-NEXT: v_mad_u32_u24 v2, 5, v0, s2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -97,7 +97,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_endpgm @@ -126,7 +126,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W64-NEXT: s_waitcnt vmcnt(0) ; GFX10W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W64-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX10W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX10W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W64-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W64-NEXT: s_endpgm @@ -154,7 +154,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10W32-NEXT: s_waitcnt vmcnt(0) ; GFX10W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10W32-NEXT: v_mov_b32_e32 v1, 0 -; GFX10W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX10W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX10W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10W32-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10W32-NEXT: s_endpgm @@ -185,7 +185,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W64-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX11W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX11W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W64-NEXT: s_endpgm @@ -215,7 +215,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX11W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX11W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX11W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11W32-NEXT: s_endpgm @@ -248,7 +248,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W64-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12W64-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W64-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX12W64-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX12W64-NEXT: s_wait_kmcnt 0x0 ; GFX12W64-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W64-NEXT: s_endpgm @@ -278,7 +278,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12W32-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX12W32-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12W32-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX12W32-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX12W32-NEXT: s_wait_kmcnt 0x0 ; GFX12W32-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12W32-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll index 72e95db6da440..fe0892788ca84 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_waitcnt_for_precise_memory.ll @@ -905,7 +905,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -934,7 +934,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX90A-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX90A-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_endpgm @@ -964,7 +964,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX10-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm @@ -993,7 +993,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: s_endpgm @@ -1024,7 +1024,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX11-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm @@ -1058,7 +1058,7 @@ define amdgpu_kernel void @atomic_add_ret_local(ptr addrspace(1) %out, ptr addrs ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_wait_alu depctr_va_sdst(0) ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX12-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_endpgm @@ -1096,7 +1096,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX9-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm @@ -1124,7 +1124,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v1 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 -; GFX90A-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX90A-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX90A-NEXT: global_store_dword v2, v0, s[0:1] ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: s_endpgm @@ -1152,7 +1152,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s2, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX10-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX10-NEXT: global_store_dword v1, v0, s[0:1] ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_endpgm @@ -1180,7 +1180,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX9-FLATSCR-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-FLATSCR-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-FLATSCR-NEXT: v_mov_b32_e32 v2, 0 -; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX9-FLATSCR-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX9-FLATSCR-NEXT: global_store_dword v2, v0, s[0:1] ; GFX9-FLATSCR-NEXT: s_waitcnt vmcnt(0) ; GFX9-FLATSCR-NEXT: s_endpgm @@ -1210,7 +1210,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX11-NEXT: v_readfirstlane_b32 s2, v1 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX11-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_endpgm @@ -1241,7 +1241,7 @@ define amdgpu_kernel void @add_i32_constant(ptr addrspace(1) %out, ptr addrspace ; GFX12-NEXT: v_readfirstlane_b32 s2, v1 ; GFX12-NEXT: v_mov_b32_e32 v1, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX12-NEXT: v_mad_u32_u24 v0, v0, 5, s2 +; GFX12-NEXT: v_mad_u32_u24 v0, 5, v0, s2 ; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX12-NEXT: s_wait_storecnt 0x0 ; GFX12-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll b/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll index cc12db451c74b..e08ce09fe605a 100644 --- a/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/load-saddr-offset-imm.ll @@ -11,8 +11,8 @@ define amdgpu_ps <2 x float> @global_load_scale_add_foldable_knownbits(ptr addrs ; GFX12: ; %bb.0: ; GFX12-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 -; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; GFX12-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: ; return to shader part epilog ; @@ -20,9 +20,7 @@ define amdgpu_ps <2 x float> @global_load_scale_add_foldable_knownbits(ptr addrs ; GFX1250: ; %bb.0: ; GFX1250-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_WAVE_MODE, 25, 1), 1 ; GFX1250-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 -; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1250-NEXT: v_lshl_add_u32 v0, v0, 3, 0x80 -; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] +; GFX1250-NEXT: global_load_b64 v[0:1], v0, s[2:3] offset:128 scale_offset ; GFX1250-NEXT: s_wait_loadcnt 0x0 ; GFX1250-NEXT: ; return to shader part epilog %v = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/canonicalize-add-to-gep.ll b/llvm/test/Transforms/InstCombine/AMDGPU/canonicalize-add-to-gep.ll new file mode 100644 index 0000000000000..b1016f3a741fc --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AMDGPU/canonicalize-add-to-gep.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize32 -passes=instcombine -S < %s | FileCheck --check-prefixes=CHECK,WAVE32 %s +; RUN: opt -mtriple=amdgcn-- -mattr=+wavefrontsize64 -passes=instcombine -S < %s | FileCheck --check-prefixes=CHECK,WAVE64 %s + +; As the addition of 32 does not overflow, it can be canonicalized as gep. +define amdgpu_ps <2 x float> @turn_add_into_gep(ptr addrspace(1) inreg %sbase) { +; CHECK-LABEL: define amdgpu_ps <2 x float> @turn_add_into_gep( +; CHECK-SAME: ptr addrspace(1) inreg [[SBASE:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[V:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; CHECK-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[V]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i32 [[MUL]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw float, ptr addrspace(1) [[SBASE]], i64 [[TMP1]] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i8, ptr addrspace(1) [[TMP2]], i64 128 +; CHECK-NEXT: [[LOAD:%.*]] = load <2 x float>, ptr addrspace(1) [[GEP]], align 8 +; CHECK-NEXT: ret <2 x float> [[LOAD]] +; + %v = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %mul = shl i32 %v, 1 + %add = add i32 %mul, 32 + %zext.offset = zext i32 %add to i64 + %gep = getelementptr inbounds float, ptr addrspace(1) %sbase, i64 %zext.offset + %load = load <2 x float>, ptr addrspace(1) %gep + ret <2 x float> %load +} + +declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; WAVE32: {{.*}} +; WAVE64: {{.*}} diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll index 4617592c70f7a..8c2e6fe19bd20 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/llvm.amdgcn.wave.shuffle.ll @@ -50,7 +50,7 @@ define i32 @test_wave_shuffle_dpp_row_share_0(i32 %val) { ; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NO-WAVE-SIZE-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) ; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) -; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520 +; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 112 ; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[MASKED]]) ; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]] ; @@ -77,7 +77,7 @@ define i32 @test_wave_shuffle_dpp_row_share_0_no_or(i32 %val) { ; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NO-WAVE-SIZE-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) ; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) -; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520 +; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 112 ; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[MASKED]]) ; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]] ; @@ -159,7 +159,7 @@ define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(i32 %val) { ; CHECK-W64-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_lo_only( ; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-W64-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W64-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520 +; CHECK-W64-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 48 ; CHECK-W64-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7 ; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]]) ; CHECK-W64-NEXT: ret i32 [[RES]] @@ -167,7 +167,7 @@ define i32 @test_wave_shuffle_dpp_row_share_7_lo_only(i32 %val) { ; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_dpp_row_share_7_lo_only( ; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { ; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65520 +; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 48 ; CHECK-NO-WAVE-SIZE-NEXT: [[SHARE_7:%.*]] = or disjoint i32 [[MASKED]], 7 ; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[SHARE_7]]) ; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]] @@ -217,28 +217,17 @@ define i32 @test_wave_shuffle_dpp_row_share_w32_mask(i32 %val) { define i32 @test_wave_shuffle_not_quite_row_share(i32 %val) { ; CHECK-W32-LABEL: define i32 @test_wave_shuffle_not_quite_row_share( ; CHECK-W32-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W32-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W32-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65280 -; CHECK-W32-NEXT: [[OR_RES:%.*]] = or disjoint i32 [[MASKED]], 55 -; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]]) +; CHECK-W32-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 55) ; CHECK-W32-NEXT: ret i32 [[RES]] ; ; CHECK-W64-LABEL: define i32 @test_wave_shuffle_not_quite_row_share( ; CHECK-W64-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-W64-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-W64-NEXT: [[TID1:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[TID]]) -; CHECK-W64-NEXT: [[MASKED:%.*]] = and i32 [[TID1]], 65280 -; CHECK-W64-NEXT: [[OR_RES:%.*]] = or disjoint i32 [[MASKED]], 55 -; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]]) +; CHECK-W64-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 55) ; CHECK-W64-NEXT: ret i32 [[RES]] ; ; CHECK-NO-WAVE-SIZE-LABEL: define i32 @test_wave_shuffle_not_quite_row_share( ; CHECK-NO-WAVE-SIZE-SAME: i32 [[VAL:%.*]]) #[[ATTR0]] { -; CHECK-NO-WAVE-SIZE-NEXT: [[LO:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) -; CHECK-NO-WAVE-SIZE-NEXT: [[TID:%.*]] = tail call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) -; CHECK-NO-WAVE-SIZE-NEXT: [[MASKED:%.*]] = and i32 [[TID]], 65280 -; CHECK-NO-WAVE-SIZE-NEXT: [[OR_RES:%.*]] = or disjoint i32 [[MASKED]], 55 -; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 [[OR_RES]]) +; CHECK-NO-WAVE-SIZE-NEXT: [[RES:%.*]] = tail call i32 @llvm.amdgcn.wave.shuffle.i32(i32 [[VAL]], i32 55) ; CHECK-NO-WAVE-SIZE-NEXT: ret i32 [[RES]] ; %lo = tail call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt.ll b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt.ll index bd7792f21c0a5..dfeaa872b846d 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/mbcnt.ll @@ -92,5 +92,66 @@ define i32 @ockl_lane_u32() { ret i32 %hi } +define i32 @mbcnt_lo_and63() { +; DEFAULT-LABEL: define i32 @mbcnt_lo_and63() { +; DEFAULT-NEXT: [[LO:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; DEFAULT-NEXT: ret i32 [[LO]] +; +; WAVE32-LABEL: define i32 @mbcnt_lo_and63 +; WAVE32-SAME: () #[[ATTR1]] { +; WAVE32-NEXT: [[LO:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; WAVE32-NEXT: ret i32 [[LO]] +; +; WAVE64-LABEL: define i32 @mbcnt_lo_and63 +; WAVE64-SAME: () #[[ATTR1]] { +; WAVE64-NEXT: [[LO:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; WAVE64-NEXT: ret i32 [[LO]] +; + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %and = and i32 %lo, 63 + ret i32 %lo +} + +define i32 @mbcnt_hi_and31() { +; DEFAULT-LABEL: define i32 @mbcnt_hi_and31() { +; DEFAULT-NEXT: [[HI:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 0) +; DEFAULT-NEXT: ret i32 [[HI]] +; +; WAVE32-LABEL: define i32 @mbcnt_hi_and31 +; WAVE32-SAME: () #[[ATTR1]] { +; WAVE32-NEXT: ret i32 0 +; +; WAVE64-LABEL: define i32 @mbcnt_hi_and31 +; WAVE64-SAME: () #[[ATTR1]] { +; WAVE64-NEXT: [[HI:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 0) +; WAVE64-NEXT: ret i32 [[HI]] +; + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 0) + %and = and i32 %hi, 31 + ret i32 %and +} + +define i32 @ockl_lane_u32_and127() { +; DEFAULT-LABEL: define i32 @ockl_lane_u32_and127() { +; DEFAULT-NEXT: [[LO:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; DEFAULT-NEXT: [[HI:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) +; DEFAULT-NEXT: ret i32 [[HI]] +; +; WAVE32-LABEL: define i32 @ockl_lane_u32_and127 +; WAVE32-SAME: () #[[ATTR1]] { +; WAVE32-NEXT: [[LO:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; WAVE32-NEXT: ret i32 [[LO]] +; +; WAVE64-LABEL: define i32 @ockl_lane_u32_and127 +; WAVE64-SAME: () #[[ATTR1]] { +; WAVE64-NEXT: [[LO:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) +; WAVE64-NEXT: [[HI:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 [[LO]]) +; WAVE64-NEXT: ret i32 [[HI]] +; + %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) + %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) + %and = and i32 %hi, 127 + ret i32 %and +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}}