[DAG] Fold (shl (srl x, c), c) -> and(x, m) even if srl has other uses
If we're using a shift pair as a mask, relax the one-use limit when the shift amounts are equal, since we'll only be generating a single AND node.
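
As a rough illustration (not part of the patch), the scalar identity behind the fold: shifting right and then left by the same constant c is equivalent to ANDing with a mask that clears the low c bits, so even when the srl result has other users, folding the shl introduces only a single AND node. The snippet below is a hypothetical standalone check; the names viaShiftPair and mask are illustrative only.

// Minimal C++ sketch, assuming 32-bit unsigned values and 0 <= c < 32.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t x = 0xDEADBEEFu;
  for (uint32_t c = 0; c < 32; ++c) {
    uint32_t viaShiftPair = (x >> c) << c;               // (shl (srl x, c), c)
    uint32_t mask = (c == 0) ? ~0u : ~((1u << c) - 1u);  // and(x, m) with the low c bits cleared
    assert(viaShiftPair == (x & mask));                  // both forms agree for every c
  }
  return 0;
}

In the multi-use case the srl stays around for its other users and only the shl is rewritten to the AND, which is why the one-use restriction can be dropped when the shift amounts match.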

AArch64 has a couple of regressions due to this, so I've enforced the existing one-use limit inside an AArch64TargetLowering::shouldFoldConstantShiftPairToMask callback.

Part of the work to fix the regressions in D77804.

Differential Revision: https://reviews.llvm.org/D125607
RKSimon committed May 17, 2022
1 parent 6de59ca commit d40b7f0
Showing 12 changed files with 65 additions and 62 deletions.
4 changes: 2 additions & 2 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -8937,8 +8937,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
// (and (srl x, (sub c1, c2), MASK)
// Only fold this if the inner shift has no other uses -- if it does,
// folding this will increase the total number of instructions.
-// TODO - drop hasOneUse requirement if c1 == c2?
-if (N0.getOpcode() == ISD::SRL && N0.hasOneUse() &&
+if (N0.getOpcode() == ISD::SRL &&
+(N0.getOperand(1) == N1 || N0.hasOneUse()) &&
TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
/*AllowUndefs*/ false,
11 changes: 11 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13300,6 +13300,17 @@ AArch64TargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
return true;
}

+bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
+const SDNode *N, CombineLevel Level) const {
+assert(((N->getOpcode() == ISD::SHL &&
+N->getOperand(0).getOpcode() == ISD::SRL) ||
+(N->getOpcode() == ISD::SRL &&
+N->getOperand(0).getOpcode() == ISD::SHL)) &&
+"Expected shift-shift mask");
+// Don't allow multiuse shift folding with the same shift amount.
+return N->getOperand(0)->hasOneUse();
+}
+
bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const {
assert(Ty->isIntegerTy());
4 changes: 4 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -644,6 +644,10 @@ class AArch64TargetLowering : public TargetLowering {
bool isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const override;

+/// Return true if it is profitable to fold a pair of shifts into a mask.
+bool shouldFoldConstantShiftPairToMask(const SDNode *N,
+CombineLevel Level) const override;
+
/// Returns true if it is beneficial to convert a load of a constant
/// to just the constant itself.
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
10 changes: 5 additions & 5 deletions llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -131,17 +131,17 @@ define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> ad
; CI-NEXT: s_load_dword s4, s[4:5], 0xc
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: s_load_dword s2, s[2:3], 0x0
-; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: v_mov_b32_e32 v0, s0
+; CI-NEXT: v_mov_b32_e32 v1, s1
; CI-NEXT: s_and_b32 s0, s4, 0xffff
; CI-NEXT: s_waitcnt lgkmcnt(0)
-; CI-NEXT: s_lshr_b32 s1, s2, 16
-; CI-NEXT: s_lshl_b32 s2, s1, 16
-; CI-NEXT: s_or_b32 s0, s0, s2
+; CI-NEXT: s_and_b32 s1, s2, 0xffff0000
+; CI-NEXT: s_or_b32 s0, s0, s1
; CI-NEXT: v_mov_b32_e32 v2, s0
+; CI-NEXT: s_lshr_b32 s2, s2, 16
; CI-NEXT: flat_store_dword v[0:1], v2
; CI-NEXT: ;;#ASMSTART
-; CI-NEXT: ; use s1
+; CI-NEXT: ; use s2
; CI-NEXT: ;;#ASMEND
; CI-NEXT: s_endpgm
%vec = load <2 x i16>, <2 x i16> addrspace(4)* %vec.ptr
12 changes: 6 additions & 6 deletions llvm/test/CodeGen/AMDGPU/load-lo16.ll
@@ -595,12 +595,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_hi(i16 addrspace(3)* %in,
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_mov_b32_e32 v2, 0
-; GFX803-NEXT: ds_write_b16 v2, v1
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX803-NEXT: v_mov_b32_e32 v3, 0
+; GFX803-NEXT: ds_write_b16 v3, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(1)
-; GFX803-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX803-NEXT: v_or_b32_e32 v0, v0, v2
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
@@ -647,12 +647,12 @@ define void @load_local_lo_v2i16_reghi_vreg_multi_use_lohi(i16 addrspace(3)* noa
; GFX803-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_mov_b32 m0, -1
; GFX803-NEXT: ds_read_u16 v0, v0
+; GFX803-NEXT: v_and_b32_e32 v4, 0xffff0000, v1
; GFX803-NEXT: v_lshrrev_b32_e32 v1, 16, v1
; GFX803-NEXT: s_waitcnt lgkmcnt(0)
; GFX803-NEXT: ds_write_b16 v2, v0
; GFX803-NEXT: ds_write_b16 v3, v1
-; GFX803-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; GFX803-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX803-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; GFX803-NEXT: flat_store_dword v[0:1], v0
; GFX803-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; GFX803-NEXT: s_setpc_b64 s[30:31]
30 changes: 14 additions & 16 deletions llvm/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -38,8 +38,8 @@ define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out,
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -85,8 +85,8 @@ define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out,
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
-; VI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
-; VI-NEXT: v_alignbit_b32 v0, v1, v0, 16
+; VI-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
+; VI-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
; VI-NEXT: v_mov_b32_e32 v1, v0
; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0
; VI-NEXT: s_endpgm
@@ -106,9 +106,9 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
+; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v2
@@ -123,9 +123,8 @@ define amdgpu_kernel void @scalar_to_vector_v4i16() {
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
-; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v0
+; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -148,9 +147,9 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() {
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v1, 8, v0
; SI-NEXT: v_or_b32_e32 v0, v1, v0
-; SI-NEXT: v_lshrrev_b32_e32 v1, 8, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v1
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_and_b32_e32 v1, 0xff00, v0
+; SI-NEXT: v_lshrrev_b32_e32 v2, 8, v0
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; SI-NEXT: v_or_b32_e32 v1, v1, v2
; SI-NEXT: v_or_b32_e32 v0, v0, v2
@@ -165,9 +164,8 @@ define amdgpu_kernel void @scalar_to_vector_v4f16() {
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b16_e32 v1, 8, v0
; VI-NEXT: v_or_b32_e32 v0, v1, v0
-; VI-NEXT: v_lshrrev_b16_e32 v1, 8, v0
-; VI-NEXT: v_lshlrev_b16_e32 v2, 8, v1
-; VI-NEXT: v_or_b32_e32 v1, v1, v2
+; VI-NEXT: v_and_b32_e32 v1, 0xffffff00, v0
+; VI-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
; VI-NEXT: v_lshlrev_b32_e32 v2, 16, v1
; VI-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
; VI-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/ARM/combine-movc-sub.ll
@@ -25,15 +25,15 @@ declare void @foo(%struct.PROOFSEARCH_HELP*, %struct.CLAUSE_HELP*)
define hidden fastcc %struct.LIST_HELP* @test(%struct.PROOFSEARCH_HELP* %Search, %struct.LIST_HELP* %ClauseList, i32 %Level, %struct.LIST_HELP** nocapture %New) {
; CHECK-LABEL: test:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: sub.w r9, r2, #32
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: sub.w r7, r2, #32
; CHECK-NEXT: mov r8, r0
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: add.w r6, r0, r9, lsr #5
+; CHECK-NEXT: add.w r6, r0, r7, lsr #5
; CHECK-NEXT: mov r5, r1
-; CHECK-NEXT: lsr.w r7, r9, #5
-; CHECK-NEXT: mov.w r10, #0
+; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_1: @ %for.inc
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
@@ -47,15 +47,15 @@ define hidden fastcc %struct.LIST_HELP* @test(%struct.PROOFSEARCH_HELP* %Search,
; CHECK-NEXT: add.w r0, r0, r6, lsl #2
; CHECK-NEXT: ldr r0, [r0, #40]
; CHECK-NEXT: it hi
-; CHECK-NEXT: subhi.w r2, r9, r7, lsl #5
+; CHECK-NEXT: andhi r2, r7, #31
; CHECK-NEXT: lsrs r0, r2
; CHECK-NEXT: lsls r0, r0, #31
; CHECK-NEXT: beq .LBB0_1
; CHECK-NEXT: @ %bb.3: @ %if.then
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
; CHECK-NEXT: mov r0, r8
; CHECK-NEXT: bl foo
-; CHECK-NEXT: str.w r10, [r5, #4]
+; CHECK-NEXT: str.w r9, [r5, #4]
; CHECK-NEXT: b .LBB0_1
entry:
%cmp4.i.i = icmp ugt i32 %Level, 31
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/RISCV/rvv/extract-subvector.ll
@@ -318,9 +318,8 @@ define <vscale x 1 x i8> @extract_nxv8i8_nxv1i8_7(<vscale x 8 x i8> %vec) {
; CHECK-LABEL: extract_nxv8i8_nxv1i8_7:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: slli a1, a0, 3
-; CHECK-NEXT: sub a0, a1, a0
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: sub a0, a0, a1
; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, mu
; CHECK-NEXT: vslidedown.vx v8, v8, a0
; CHECK-NEXT: ret
18 changes: 8 additions & 10 deletions llvm/test/CodeGen/RISCV/rvv/insert-subvector.ll
@@ -304,11 +304,10 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_7(<vscale x 16 x i8> %vec, <vsc
; CHECK-LABEL: insert_nxv16i8_nxv1i8_7:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: slli a1, a0, 3
-; CHECK-NEXT: sub a0, a1, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, mu
-; CHECK-NEXT: vslideup.vx v8, v10, a0
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu
+; CHECK-NEXT: vslideup.vx v8, v10, a1
; CHECK-NEXT: ret
%v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 7)
ret <vscale x 16 x i8> %v
@@ -318,11 +317,10 @@ define <vscale x 16 x i8> @insert_nxv16i8_nxv1i8_15(<vscale x 16 x i8> %vec, <vs
; CHECK-LABEL: insert_nxv16i8_nxv1i8_15:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a0, vlenb
-; CHECK-NEXT: srli a0, a0, 3
-; CHECK-NEXT: slli a1, a0, 3
-; CHECK-NEXT: sub a0, a1, a0
-; CHECK-NEXT: vsetvli zero, a1, e8, m1, tu, mu
-; CHECK-NEXT: vslideup.vx v9, v10, a0
+; CHECK-NEXT: srli a1, a0, 3
+; CHECK-NEXT: sub a1, a0, a1
+; CHECK-NEXT: vsetvli zero, a0, e8, m1, tu, mu
+; CHECK-NEXT: vslideup.vx v9, v10, a1
; CHECK-NEXT: ret
%v = call <vscale x 16 x i8> @llvm.experimental.vector.insert.nxv1i8.nxv16i8(<vscale x 16 x i8> %vec, <vscale x 1 x i8> %subvec, i64 15)
ret <vscale x 16 x i8> %v
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/RISCV/rvv/legalize-load-sdnode.ll
@@ -36,9 +36,8 @@ define <vscale x 7 x half> @load_nxv7f16(<vscale x 7 x half>* %ptr, <vscale x 7
; CHECK-LABEL: load_nxv7f16:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a2, vlenb
-; CHECK-NEXT: srli a2, a2, 3
-; CHECK-NEXT: slli a3, a2, 3
-; CHECK-NEXT: sub a2, a3, a2
+; CHECK-NEXT: srli a3, a2, 3
+; CHECK-NEXT: sub a2, a2, a3
; CHECK-NEXT: vsetvli zero, a2, e16, m2, ta, mu
; CHECK-NEXT: vle16.v v8, (a0)
; CHECK-NEXT: vse16.v v8, (a1)
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/RISCV/rvv/legalize-store-sdnode.ll
@@ -22,9 +22,8 @@ define void @store_nxv7f64(<vscale x 7 x double> %val, <vscale x 7 x double>* %p
; CHECK-LABEL: store_nxv7f64:
; CHECK: # %bb.0:
; CHECK-NEXT: csrr a1, vlenb
-; CHECK-NEXT: srli a1, a1, 3
-; CHECK-NEXT: slli a2, a1, 3
-; CHECK-NEXT: sub a1, a2, a1
+; CHECK-NEXT: srli a2, a1, 3
+; CHECK-NEXT: sub a1, a1, a2
; CHECK-NEXT: vsetvli zero, a1, e64, m8, ta, mu
; CHECK-NEXT: vse64.v v8, (a0)
; CHECK-NEXT: ret
9 changes: 2 additions & 7 deletions llvm/test/CodeGen/SystemZ/store_nonbytesized_vecs.ll
@@ -123,15 +123,10 @@ define void @fun2(<8 x i32> %src, <8 x i31>* %p)
define void @fun3(<3 x i31>* %src, <3 x i31>* %p)
; CHECK-LABEL: fun3:
; CHECK: # %bb.0:
-; CHECK-NEXT: l %r0, 8(%r2)
+; CHECK-NEXT: llgf %r0, 8(%r2)
; CHECK-NEXT: lg %r1, 0(%r2)
-; CHECK-NEXT: sllg %r2, %r1, 32
-; CHECK-NEXT: lr %r2, %r0
-; CHECK-NEXT: st %r0, 8(%r3)
-; CHECK-NEXT: srlg %r0, %r2, 32
-; CHECK-NEXT: lr %r1, %r0
-; CHECK-NEXT: nihh %r1, 8191
; CHECK-NEXT: stg %r1, 0(%r3)
+; CHECK-NEXT: st %r0, 8(%r3)
; CHECK-NEXT: br %r14
{
%tmp = load <3 x i31>, <3 x i31>* %src
