Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2926,6 +2926,14 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
if (SDValue RADD = reassociateOps(ISD::ADD, DL, N0, N1, N->getFlags()))
return RADD;

// (X + Y) + X --> Y + (X + X)
SDValue X, Y;
if (sd_match(N, m_AddLike(m_OneUse(m_AddLike(m_Value(X), m_Value(Y))),
m_Deferred(X))))
if (X != Y)
return DAG.getNode(ISD::ADD, DL, VT, Y,
DAG.getNode(ISD::ADD, DL, VT, X, X));

// Reassociate (add (or x, c), y) -> (add (add x, y), c) if (or x, c) is
// equivalent to (add x, c).
// Reassociate (add (xor x, c), y) -> (add (add x, y), c) if (xor x, c) is
Expand Down
5 changes: 1 addition & 4 deletions llvm/test/CodeGen/AArch64/dag-ReplaceAllUsesOfValuesWith.ll
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,7 @@
define i64 @g(ptr %p) {
; CHECK-LABEL: g:
; CHECK: // %bb.0:
; CHECK-NEXT: ldr x8, [x0, #8]
; CHECK-NEXT: add x9, x8, x8
; CHECK-NEXT: add x8, x9, x8
; CHECK-NEXT: sub x0, x8, x8
; CHECK-NEXT: mov x0, xzr
; CHECK-NEXT: ret
%vec = load <2 x i64>, ptr %p, align 1
%elt = extractelement <2 x i64> %vec, i32 1
Expand Down
122 changes: 71 additions & 51 deletions llvm/test/CodeGen/AMDGPU/idot2.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1717,11 +1717,11 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, s0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_add_i32_e32 v1, vcc, v1, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
Expand All @@ -1748,8 +1748,8 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_u32_u24 v1, v2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
Expand All @@ -1765,13 +1765,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_and_b32_e32 v4, 0xffff, v2
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v1
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v4, v3, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
Expand All @@ -1785,13 +1788,16 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_and_b32_e32 v3, 0xffff, v1
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mul_u32_u24_sdwa v3, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_and_b32_e32 v4, 0xffff, v2
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v1
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v4, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
Expand All @@ -1811,12 +1817,14 @@ define amdgpu_kernel void @udot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_and_b32_e32 v1, 0xffff, v1
; GFX10-DL-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v2, v1, v0
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jayfoad @arsenm It looks like we're missing demanded bits handling for MAD24 instructions - but I haven't found much in the DAG that handles the MAD24 opcodes at all - is this all currently done with isel patterns? Is it going to cause problems if I try to add MAD24 DAG lowering?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't have the code in front of me but I think it's mostly done in IR in AMDGPUCodeGenPrepare. We'd like to form mul24 when only the low 24 bits of a regular mul are demanded, but I don't think there's an easy way to implement a target-specific demanded bits optimization for a generic node like MUL.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we handle it in both AMDGPUCodeGenPrepare and a DAG combine. We appear to be missing SimplifyDemandedBitsForTargetNode.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I can find plenty in CGP/DAG for MUL24 but not much for MAD24

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, the mad24 case I think is just a td pattern

; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
Expand Down Expand Up @@ -1873,8 +1881,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, s0
; GFX7-NEXT: v_mad_i32_i24 v1, v3, v1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v1, v0
; GFX7-NEXT: v_add_i32_e32 v0, vcc, v0, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
;
Expand All @@ -1901,8 +1909,8 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, s0
; GFX8-NEXT: v_mad_i32_i24 v1, v2, v1, v0
; GFX8-NEXT: v_add_u32_e32 v2, vcc, v1, v0
; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: flat_store_dword v[0:1], v2
Expand All @@ -1918,13 +1926,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-NODL-NEXT: s_waitcnt vmcnt(1)
; GFX9-NODL-NEXT: v_bfe_i32 v3, v1, 0, 16
; GFX9-NODL-NEXT: s_waitcnt vmcnt(0)
; GFX9-NODL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-NODL-NEXT: v_bfe_i32 v4, v2, 0, 16
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-NODL-NEXT: v_add_u32_e32 v1, v1, v1
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v4, v3, v1
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
Expand All @@ -1938,13 +1949,16 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3]
; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0
; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0
; GFX9-DL-NEXT: s_waitcnt vmcnt(1)
; GFX9-DL-NEXT: v_bfe_i32 v3, v1, 0, 16
; GFX9-DL-NEXT: s_waitcnt vmcnt(0)
; GFX9-DL-NEXT: v_mul_i32_i24_sdwa v3, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX9-DL-NEXT: v_bfe_i32 v4, v2, 0, 16
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v1, v3, v1
; GFX9-DL-NEXT: v_add_u32_e32 v1, v1, v1
; GFX9-DL-NEXT: v_mad_i32_i24 v1, v4, v3, v1
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
Expand All @@ -1964,12 +1978,14 @@ define amdgpu_kernel void @idot2_MultipleUses_add1(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v0, 16, v1
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 16
; GFX10-DL-NEXT: v_bfe_i32 v2, v2, 0, 16
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v0
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v2, v1, v0
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
Expand Down Expand Up @@ -2341,10 +2357,11 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v4, v3, v1, s0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_mul_u32_u24_e32 v4, v3, v1
; GFX7-NEXT: v_mad_u32_u24 v1, v3, v1, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v1, vcc, s0, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v0, v2, v1
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
Expand All @@ -2370,9 +2387,10 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_and_b32_e32 v2, 0xffff, v0
; GFX8-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_u32_u24 v4, v0, v3, s0
; GFX8-NEXT: v_mul_u32_u24_e32 v4, v0, v3
; GFX8-NEXT: v_mad_u32_u24 v0, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_mad_u32_u24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -2394,9 +2412,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, v4
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: v_add3_u32 v1, s0, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
Expand All @@ -2415,9 +2433,9 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 16, v2
; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, v4
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: v_add3_u32 v1, s0, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
Expand All @@ -2438,12 +2456,12 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_u32_u24_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_mul_u32_u24_e32 v4, v3, v0
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, v4
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
Expand Down Expand Up @@ -2499,9 +2517,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_bfe_i32 v3, v0, 0, 16
; GFX7-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_mad_i32_i24 v4, v0, v2, s0
; GFX7-NEXT: v_mul_i32_i24_e32 v4, v0, v2
; GFX7-NEXT: v_mad_i32_i24 v0, v0, v2, v4
; GFX7-NEXT: s_waitcnt lgkmcnt(0)
; GFX7-NEXT: v_add_i32_e32 v0, vcc, s0, v0
; GFX7-NEXT: v_mad_i32_i24 v0, v3, v1, v0
; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0
; GFX7-NEXT: s_endpgm
Expand All @@ -2527,9 +2546,10 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: v_bfe_i32 v2, v0, 0, 16
; GFX8-NEXT: v_ashrrev_i32_e32 v0, 16, v0
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_mad_i32_i24 v4, v0, v3, s0
; GFX8-NEXT: v_mul_i32_i24_e32 v4, v0, v3
; GFX8-NEXT: v_mad_i32_i24 v0, v0, v3, v4
; GFX8-NEXT: s_waitcnt lgkmcnt(0)
; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0
; GFX8-NEXT: v_mad_i32_i24 v2, v2, v1, v0
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
Expand All @@ -2551,9 +2571,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-NODL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, v4
; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-NODL-NEXT: v_add3_u32 v1, s0, v1, v3
; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-NODL-NEXT: s_endpgm
;
Expand All @@ -2572,9 +2592,9 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v1, 16, v1
; GFX9-DL-NEXT: v_ashrrev_i32_e32 v2, 16, v2
; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1
; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, v4
; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0
; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3
; GFX9-DL-NEXT: v_add3_u32 v1, s0, v1, v3
; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7]
; GFX9-DL-NEXT: s_endpgm
;
Expand All @@ -2595,12 +2615,12 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1,
; GFX10-DL-NEXT: s_waitcnt vmcnt(0)
; GFX10-DL-NEXT: v_ashrrev_i32_e32 v3, 16, v2
; GFX10-DL-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v2, v3, v0
; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0
; GFX10-DL-NEXT: v_mul_i32_i24_e32 v4, v3, v0
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, v4
; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0
; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0
; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1
; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7]
; GFX10-DL-NEXT: v_add3_u32 v0, s0, v0, v1
; GFX10-DL-NEXT: global_store_dword v2, v0, s[6:7]
; GFX10-DL-NEXT: s_endpgm
ptr addrspace(1) %src2,
ptr addrspace(1) nocapture %dst) {
Expand Down
3 changes: 1 addition & 2 deletions llvm/test/CodeGen/Hexagon/isel-fold-shl-zext.ll
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,10 @@ define dso_local void @foo(i64* nocapture noundef %buf, i32 %a, i32 %b) local_un
; CHECK: .cfi_startproc
; CHECK-NEXT: // %bb.0: // %entry
; CHECK-NEXT: {
; CHECK-NEXT: r2 = addasl(r2,r1,#1)
; CHECK-NEXT: r3 = asl(r1,#1)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: r2 = addasl(r2,r1,#1)
; CHECK-NEXT: r2 += add(r3,r3)
; CHECK-NEXT: }
; CHECK-NEXT: {
; CHECK-NEXT: jumpr r31
Expand Down
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/X86/avx-vinsertf128.ll
Original file line number Diff line number Diff line change
Expand Up @@ -59,13 +59,13 @@ define <4 x i32> @DAGCombineA(<4 x i32> %v1) nounwind readonly {
define <8 x i32> @DAGCombineB(<8 x i32> %v1, <8 x i32> %v2) nounwind readonly {
; CHECK-LABEL: DAGCombineB:
; CHECK: # %bb.0:
; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm1
; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
; CHECK-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; CHECK-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; CHECK-NEXT: vpaddd %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2
; CHECK-NEXT: vpaddd %xmm0, %xmm0, %xmm0
; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; CHECK-NEXT: retq
%t1 = add <8 x i32> %v1, %v2
%t2 = add <8 x i32> %t1, %v1
Expand Down