From 280a47128023ff56475a0b8db616044976d46461 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 7 Oct 2025 09:33:17 +0100 Subject: [PATCH] [DAG] Reassociate (add (add X, Y), X) --> add(add(X, X), Y) Attempt to bring together self-additions, to help with folding to shift/mul/address patterns --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 ++ .../AArch64/dag-ReplaceAllUsesOfValuesWith.ll | 5 +- llvm/test/CodeGen/AMDGPU/idot2.ll | 122 ++++++++++-------- .../CodeGen/Hexagon/isel-fold-shl-zext.ll | 3 +- llvm/test/CodeGen/X86/avx-vinsertf128.ll | 12 +- 5 files changed, 87 insertions(+), 63 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 309f1bea8b77c..ea0aeffe7f02e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2926,6 +2926,14 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) { if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags())) return RADD; + // (X + Y) + X --> Y + (X + X) + SDValue X, Y; + if (sd_match(N, m_AddLike(m_OneUse(m_AddLike(m_Value(X), m_Value(Y))), + m_Deferred(X)))) + if (X != Y) + return DAG.getNode(ISD::ADD, DL, VT, Y, + DAG.getNode(ISD::ADD, DL, VT, X, X)); + // Reassociate (add (or x, c), y) -> (add add(x, y), c)) if (or x, c) is // equivalent to (add x, c). // Reassociate (add (xor x, c), y) -> (add add(x, y), c)) if (xor x, c) is diff --git a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll index d76e817e62a49..ce657aa1f0b5b 100644 --- a/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll +++ b/llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll @@ -27,10 +27,7 @@ define i64 @g(ptr %p) { ; CHECK-LABEL: g: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr x8, [x0, #8] -; CHECK-NEXT: add x9, x8, x8 -; CHECK-NEXT: add x8, x9, x8 -; CHECK-NEXT: sub x0, x8, x8 +; CHECK-NEXT: mov x0, xzr ; CHECK-NEXT: ret %vec = load <2 x i64>, ptr %p, align 1 %elt = extractelement <2 x i64> %vec, i32 1 diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index b443e654350c5..32e26f4f387d1 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -1717,11 +1717,11 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -1748,8 +1748,8 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v0 +; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1765,13 +1765,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) +; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -1785,13 +1788,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 -; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v4, v3, v1 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -1811,12 +1817,14 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v2, v1, v0 +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -1873,8 +1881,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0 -; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0 +; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v0 +; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm ; @@ -1901,8 +1909,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0 -; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v0 +; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1918,13 +1926,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) +; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 16 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1 +; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v4, v3, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -1938,13 +1949,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: s_waitcnt vmcnt(1) +; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 +; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 16 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 -; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1 +; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v4, v3, v1 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -1964,12 +1978,14 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2 -; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 16 +; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0 -; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] +; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v2, v1, v0 +; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2341,10 +2357,11 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s0 -; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: v_mul_u32_u24_e32 v4, v3, v1 ; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4 +; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -2370,9 +2387,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0 ; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0 +; GFX8-NEXT: v_mul_u32_u24_e32 v4, v0, v3 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2394,9 +2412,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 -; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 +; GFX9-NODL-NEXT: v_add3_u32 v1, s0, v1, v3 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2415,9 +2433,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, v4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 -; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 +; GFX9-DL-NEXT: v_add3_u32 v1, s0, v1, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -2438,12 +2456,12 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v3, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, v4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] +; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -2499,9 +2517,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16 ; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s0 +; GFX7-NEXT: v_mul_i32_i24_e32 v4, v0, v2 ; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -2527,9 +2546,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16 ; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0 +; GFX8-NEXT: v_mul_i32_i24_e32 v4, v0, v3 ; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 @@ -2551,9 +2571,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 +; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, v4 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 -; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 +; GFX9-NODL-NEXT: v_add3_u32 v1, s0, v1, v3 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2572,9 +2592,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1 ; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2 ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 +; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, v4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 -; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 +; GFX9-DL-NEXT: v_add3_u32 v1, s0, v1, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -2595,12 +2615,12 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2 ; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0 -; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-DL-NEXT: v_mul_i32_i24_e32 v4, v3, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, v4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0 -; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1 -; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] +; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1 +; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { diff --git a/llvm/test/CodeGen/Hexagon/isel-fold-shl-zext.ll b/llvm/test/CodeGen/Hexagon/isel-fold-shl-zext.ll index fe0f7dd89babe..3697ff5ff79d5 100644 --- a/llvm/test/CodeGen/Hexagon/isel-fold-shl-zext.ll +++ b/llvm/test/CodeGen/Hexagon/isel-fold-shl-zext.ll @@ -15,11 +15,10 @@ define dso_local void @foo(i64* nocapture noundef %buf, i32 %a, i32 %b) local_un ; CHECK: .cfi_startproc ; CHECK-NEXT: // %bb.0: // %entry ; CHECK-NEXT: { -; CHECK-NEXT: r2 = addasl(r2,r1,#1) ; CHECK-NEXT: r3 = asl(r1,#1) ; CHECK-NEXT: } ; CHECK-NEXT: { -; CHECK-NEXT: r2 = addasl(r2,r1,#1) +; CHECK-NEXT: r2 += add(r3,r3) ; CHECK-NEXT: } ; CHECK-NEXT: { ; CHECK-NEXT: jumpr r31 diff --git a/llvm/test/CodeGen/X86/avx-vinsertf128.ll b/llvm/test/CodeGen/X86/avx-vinsertf128.ll index 9a52ef3293d2e..2c997f77f280e 100644 --- a/llvm/test/CodeGen/X86/avx-vinsertf128.ll +++ b/llvm/test/CodeGen/X86/avx-vinsertf128.ll @@ -59,13 +59,13 @@ define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly { define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly { ; CHECK-LABEL: DAGCombineB: ; CHECK: # %bb.0: -; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm2 -; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1 +; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3 -; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1 -; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm1 -; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3 +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0 +; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; CHECK-NEXT: retq %t1 = add <8 x i32> %v1, %v2 %t2 = add <8 x i32> %t1, %v1