[DAG] Fold (srl (shl x, c1), c2) -> and(shl/srl(x, c3), m)
Similar to the existing (shl (srl x, c1), c2) fold

Part of the work to fix the regressions in D77804

Differential Revision: https://reviews.llvm.org/D125836
RKSimon committed Jun 20, 2022
1 parent 26041e1 commit e4a124d
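
For reference, the identity behind the new fold can be checked with a small standalone C++ sketch (illustrative only, not part of this commit; the input value and the shift amounts 5/3 and 3/5 are arbitrary):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t x = 0xDEADBEEFu;

    // c1 >= c2: (x << c1) >> c2  ==  (x << (c1 - c2)) & ((~0u >> c1) << (c1 - c2))
    unsigned c1 = 5, c2 = 3;
    assert(((x << c1) >> c2) == ((x << (c1 - c2)) & ((~0u >> c1) << (c1 - c2))));

    // c1 <= c2: (x << c1) >> c2  ==  (x >> (c2 - c1)) & (~0u >> c2)
    c1 = 3; c2 = 5;
    assert(((x << c1) >> c2) == ((x >> (c2 - c1)) & (~0u >> c2)));
    return 0;
  }
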
Showing 12 changed files with 226 additions and 150 deletions.
44 changes: 35 additions & 9 deletions llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9419,15 +9419,41 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
}
}

  // fold (srl (shl x, c), c) -> (and x, cst2)
  // TODO - (srl (shl x, c1), c2).
  if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
      isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
    SDLoc DL(N);
    SDValue Mask =
        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
    AddToWorklist(Mask.getNode());
    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
  // fold (srl (shl x, c1), c2) -> (and (shl x, (sub c1, c2), MASK) or
  //                               (and (srl x, (sub c2, c1), MASK)
  if (N0.getOpcode() == ISD::SHL &&
      (N0.getOperand(1) == N1 || N0->hasOneUse()) &&
      TLI.shouldFoldConstantShiftPairToMask(N, Level)) {
    auto MatchShiftAmount = [OpSizeInBits](ConstantSDNode *LHS,
                                           ConstantSDNode *RHS) {
      const APInt &LHSC = LHS->getAPIntValue();
      const APInt &RHSC = RHS->getAPIntValue();
      return LHSC.ult(OpSizeInBits) && RHSC.ult(OpSizeInBits) &&
             LHSC.getZExtValue() <= RHSC.getZExtValue();
    };
    if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchShiftAmount,
                                  /*AllowUndefs*/ false,
                                  /*AllowTypeMismatch*/ true)) {
      SDLoc DL(N);
      SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
      SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N01, N1);
      SDValue Mask = DAG.getAllOnesConstant(DL, VT);
      Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N01);
      Mask = DAG.getNode(ISD::SHL, DL, VT, Mask, Diff);
      SDValue Shift = DAG.getNode(ISD::SHL, DL, VT, N0.getOperand(0), Diff);
      return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
    }
    if (ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchShiftAmount,
                                  /*AllowUndefs*/ false,
                                  /*AllowTypeMismatch*/ true)) {
      SDLoc DL(N);
      SDValue N01 = DAG.getZExtOrTrunc(N0.getOperand(1), DL, ShiftVT);
      SDValue Diff = DAG.getNode(ISD::SUB, DL, ShiftVT, N1, N01);
      SDValue Mask = DAG.getAllOnesConstant(DL, VT);
      Mask = DAG.getNode(ISD::SRL, DL, VT, Mask, N1);
      SDValue Shift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), Diff);
      return DAG.getNode(ISD::AND, DL, VT, Shift, Mask);
    }
  }

  // fold (srl (anyextend x), c) -> (and (anyextend (srl x, c)), mask)
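The scalar arithmetic behind the two branches above can be mirrored directly; the sketch below models the Diff/Mask/Shift nodes the combine builds, assuming an i32 type (the shift amounts 6/2 and 2/6 are hypothetical):

  #include <cassert>
  #include <cstdint>

  // Scalar model of the mask/shift sequence the combine emits for
  // (srl (shl X, C1), C2) on an i32 value.
  static uint32_t foldShlThenSrl(uint32_t X, unsigned C1, unsigned C2) {
    const uint32_t AllOnes = ~0u;
    if (C2 <= C1) {            // first matchBinaryPredicate branch
      unsigned Diff = C1 - C2;
      uint32_t Mask = (AllOnes >> C1) << Diff;
      return (X << Diff) & Mask;
    }
    unsigned Diff = C2 - C1;   // second branch: C1 <= C2
    uint32_t Mask = AllOnes >> C2;
    return (X >> Diff) & Mask;
  }

  int main() {
    for (uint32_t X : {0u, 1u, 0x12345678u, 0xFFFFFFFFu}) {
      assert(foldShlThenSrl(X, 6, 2) == ((X << 6) >> 2)); // undershift case
      assert(foldShlThenSrl(X, 2, 6) == ((X << 2) >> 6)); // overshift case
    }
    return 0;
  }
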
13 changes: 12 additions & 1 deletion llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -13411,7 +13411,18 @@ bool AArch64TargetLowering::shouldFoldConstantShiftPairToMask(
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
         "Expected shift-shift mask");
  // Don't allow multiuse shift folding with the same shift amount.
  return N->getOperand(0)->hasOneUse();
  if (!N->getOperand(0)->hasOneUse())
    return false;

  // Only fold srl(shl(x,c1),c2) iff C1 >= C2 to prevent loss of UBFX patterns.
  EVT VT = N->getValueType(0);
  if (N->getOpcode() == ISD::SRL && (VT == MVT::i32 || VT == MVT::i64)) {
    auto *C1 = dyn_cast<ConstantSDNode>(N->getOperand(0).getOperand(1));
    auto *C2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
    return (!C1 || !C2 || C1->getZExtValue() >= C2->getZExtValue());
  }

  return true;
}

bool AArch64TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
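The reason c1 < c2 is left unfolded on AArch64 is that the shift pair is then a plain unsigned bitfield extract, which instruction selection turns into a single UBFX. A minimal sketch of that equivalence (hypothetical values; lsb and width correspond to the UBFX operands):

  #include <cassert>
  #include <cstdint>

  int main() {
    uint32_t x = 0xABCD1234u;
    unsigned c1 = 4, c2 = 12;   // c1 < c2, so the shift pair is kept for UBFX
    unsigned lsb = c2 - c1;     // UBFX lsb operand
    unsigned width = 32 - c2;   // UBFX width operand
    uint32_t viaShifts = (x << c1) >> c2;
    uint32_t viaUbfx = (x >> lsb) & ((1u << width) - 1u); // what UBFX computes
    assert(viaShifts == viaUbfx);
    return 0;
  }
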
1 change: 1 addition & 0 deletions llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5844,6 +5844,7 @@ bool X86TargetLowering::shouldFoldConstantShiftPairToMask(
          (N->getOpcode() == ISD::SRL &&
           N->getOperand(0).getOpcode() == ISD::SHL)) &&
         "Expected shift-shift mask");
  // TODO: Should we always create i64 masks? Or only folded immediates?
  EVT VT = N->getValueType(0);
  if ((Subtarget.hasFastVectorShiftMasks() && VT.isVector()) ||
      (Subtarget.hasFastScalarShiftMasks() && !VT.isVector())) {
9 changes: 4 additions & 5 deletions llvm/test/CodeGen/AArch64/ushl_sat.ll
@@ -129,11 +129,10 @@ define i16 @combine_shlsat_to_shl_no_fold(i16 %x) nounwind {
; CHECK-LABEL: combine_shlsat_to_shl_no_fold:
; CHECK: // %bb.0:
; CHECK-NEXT: and w8, w0, #0xfffc
; CHECK-NEXT: lsl w9, w8, #14
; CHECK-NEXT: lsl w8, w8, #17
; CHECK-NEXT: and w10, w9, #0x1fff0000
; CHECK-NEXT: cmp w9, w10
; CHECK-NEXT: csinv w8, w8, wzr, eq
; CHECK-NEXT: lsl w9, w8, #17
; CHECK-NEXT: lsl w8, w8, #14
; CHECK-NEXT: cmp w8, w9, lsr #3
; CHECK-NEXT: csinv w8, w9, wzr, eq
; CHECK-NEXT: lsr w0, w8, #16
; CHECK-NEXT: ret
%x2 = lshr i16 %x, 2
90 changes: 45 additions & 45 deletions llvm/test/CodeGen/AMDGPU/idot8s.ll
@@ -2852,7 +2852,7 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: v_bfe_i32 v8, v2, 8, 4
; GFX7-NEXT: v_bfe_i32 v9, v2, 4, 4
; GFX7-NEXT: v_bfe_i32 v2, v2, 0, 4
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v3
; GFX7-NEXT: v_lshlrev_b32_e32 v3, 8, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xff, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 8, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xff, v6
@@ -2861,67 +2861,67 @@ define amdgpu_kernel void @idot8_acc8_vecMul(<8 x i4> addrspace(1)* %src1,
; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v9
; GFX7-NEXT: v_and_b32_e32 v2, 0xff, v2
; GFX7-NEXT: s_waitcnt vmcnt(1)
; GFX7-NEXT: v_ashrrev_i32_e32 v11, 28, v0
; GFX7-NEXT: v_bfe_i32 v12, v0, 24, 4
; GFX7-NEXT: v_bfe_i32 v13, v0, 20, 4
; GFX7-NEXT: v_bfe_i32 v14, v0, 16, 4
; GFX7-NEXT: v_bfe_i32 v15, v0, 12, 4
; GFX7-NEXT: v_bfe_i32 v16, v0, 8, 4
; GFX7-NEXT: v_bfe_i32 v17, v0, 4, 4
; GFX7-NEXT: v_ashrrev_i32_e32 v10, 28, v0
; GFX7-NEXT: v_bfe_i32 v11, v0, 24, 4
; GFX7-NEXT: v_bfe_i32 v12, v0, 20, 4
; GFX7-NEXT: v_bfe_i32 v13, v0, 16, 4
; GFX7-NEXT: v_bfe_i32 v14, v0, 12, 4
; GFX7-NEXT: v_bfe_i32 v15, v0, 8, 4
; GFX7-NEXT: v_bfe_i32 v16, v0, 4, 4
; GFX7-NEXT: v_bfe_i32 v0, v0, 0, 4
; GFX7-NEXT: v_or_b32_e32 v4, v4, v10
; GFX7-NEXT: v_or_b32_e32 v5, v6, v5
; GFX7-NEXT: v_or_b32_e32 v6, v8, v7
; GFX7-NEXT: v_or_b32_e32 v3, v4, v3
; GFX7-NEXT: v_or_b32_e32 v4, v6, v5
; GFX7-NEXT: v_or_b32_e32 v5, v8, v7
; GFX7-NEXT: v_or_b32_e32 v2, v2, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 8, v11
; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v12
; GFX7-NEXT: v_lshlrev_b32_e32 v9, 8, v13
; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v14
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v15
; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v16
; GFX7-NEXT: v_lshlrev_b32_e32 v14, 8, v17
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 8, v10
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v11
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 8, v12
; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v13
; GFX7-NEXT: v_lshlrev_b32_e32 v10, 8, v14
; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v15
; GFX7-NEXT: v_lshlrev_b32_e32 v12, 8, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xff, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff, v5
; GFX7-NEXT: v_or_b32_e32 v7, v8, v7
; GFX7-NEXT: v_or_b32_e32 v8, v10, v9
; GFX7-NEXT: v_or_b32_e32 v9, v13, v12
; GFX7-NEXT: v_or_b32_e32 v0, v0, v14
; GFX7-NEXT: v_lshlrev_b32_e32 v6, 16, v6
; GFX7-NEXT: v_or_b32_e32 v6, v7, v6
; GFX7-NEXT: v_or_b32_e32 v7, v9, v8
; GFX7-NEXT: v_or_b32_e32 v8, v11, v10
; GFX7-NEXT: v_or_b32_e32 v0, v0, v12
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v5
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2
; GFX7-NEXT: v_or_b32_e32 v4, v5, v4
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v7
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v9
; GFX7-NEXT: v_lshlrev_b32_e32 v8, 16, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0
; GFX7-NEXT: v_or_b32_e32 v2, v2, v6
; GFX7-NEXT: v_or_b32_e32 v0, v0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xff, v2
; GFX7-NEXT: v_lshlrev_b32_e32 v13, 16, v3
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4
; GFX7-NEXT: v_or_b32_e32 v2, v2, v5
; GFX7-NEXT: v_or_b32_e32 v0, v0, v8
; GFX7-NEXT: v_or_b32_e32 v4, v4, v13
; GFX7-NEXT: v_and_b32_e32 v8, 0xff, v2
; GFX7-NEXT: v_and_b32_e32 v13, 0xff, v0
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff, v8
; GFX7-NEXT: v_bfe_u32 v8, v2, 8, 8
; GFX7-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff, v7
; GFX7-NEXT: v_bfe_u32 v9, v2, 8, 8
; GFX7-NEXT: v_bfe_u32 v14, v0, 8, 8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mad_u32_u24 v1, v7, v13, v1
; GFX7-NEXT: v_or_b32_e32 v5, v6, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v6, 24, v2
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v13, v1
; GFX7-NEXT: v_or_b32_e32 v5, v7, v5
; GFX7-NEXT: v_lshrrev_b32_e32 v7, 24, v2
; GFX7-NEXT: v_bfe_u32 v2, v2, 16, 8
; GFX7-NEXT: v_lshrrev_b32_e32 v12, 24, v0
; GFX7-NEXT: v_bfe_u32 v0, v0, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v1, v8, v14, v1
; GFX7-NEXT: v_mad_u32_u24 v1, v9, v14, v1
; GFX7-NEXT: v_mad_u32_u24 v0, v2, v0, v1
; GFX7-NEXT: v_and_b32_e32 v9, 0xff, v4
; GFX7-NEXT: v_and_b32_e32 v10, 0xff, v4
; GFX7-NEXT: v_and_b32_e32 v15, 0xff, v5
; GFX7-NEXT: v_mad_u32_u24 v0, v6, v12, v0
; GFX7-NEXT: v_bfe_u32 v10, v4, 8, 8
; GFX7-NEXT: v_mad_u32_u24 v0, v7, v12, v0
; GFX7-NEXT: v_bfe_u32 v11, v4, 8, 8
; GFX7-NEXT: v_bfe_u32 v16, v5, 8, 8
; GFX7-NEXT: v_mad_u32_u24 v0, v9, v15, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v10, v15, v0
; GFX7-NEXT: v_bfe_u32 v4, v4, 16, 8
; GFX7-NEXT: v_bfe_u32 v5, v5, 16, 8
; GFX7-NEXT: v_mad_u32_u24 v0, v10, v16, v0
; GFX7-NEXT: v_and_b32_e32 v3, 0xff, v3
; GFX7-NEXT: v_and_b32_e32 v11, 0xff, v11
; GFX7-NEXT: v_mad_u32_u24 v0, v11, v16, v0
; GFX7-NEXT: v_bfe_u32 v3, v3, 8, 8
; GFX7-NEXT: v_bfe_u32 v6, v6, 8, 8
; GFX7-NEXT: v_mad_u32_u24 v0, v4, v5, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v11, v0
; GFX7-NEXT: v_mad_u32_u24 v0, v3, v6, v0
; GFX7-NEXT: buffer_store_byte v0, off, s[0:3], 0
; GFX7-NEXT: s_endpgm
;
8 changes: 4 additions & 4 deletions llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -683,8 +683,8 @@ define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(
; SI-NEXT: s_mov_b32 s4, s0
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; SI-NEXT: v_lshlrev_b32_e32 v0, 30, v0
; SI-NEXT: v_and_b32_e32 v0, 2.0, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -702,8 +702,8 @@ define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(
; VI-NEXT: s_mov_b32 s4, s0
; VI-NEXT: s_mov_b32 s5, s1
; VI-NEXT: s_waitcnt vmcnt(0)
; VI-NEXT: v_lshlrev_b32_e32 v0, 31, v0
; VI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
; VI-NEXT: v_lshlrev_b32_e32 v0, 30, v0
; VI-NEXT: v_and_b32_e32 v0, 2.0, v0
; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; VI-NEXT: s_endpgm
%x = load i32, i32 addrspace(1)* %in, align 4
17 changes: 6 additions & 11 deletions llvm/test/CodeGen/ARM/umulo-32.ll
@@ -31,23 +31,18 @@ define i32 @test2(i32* %m_degree) ssp {
; CHECK-LABEL: test2:
; CHECK: @ %bb.0:
; CHECK-NEXT: push {r4, lr}
; CHECK-NEXT: movs r1, #7
; CHECK-NEXT: lsls r1, r1, #29
; CHECK-NEXT: ldr r0, [r0]
; CHECK-NEXT: mov r2, r0
; CHECK-NEXT: bics r2, r1
; CHECK-NEXT: subs r1, r0, r2
; CHECK-NEXT: ldr r1, [r0]
; CHECK-NEXT: lsls r0, r1, #3
; CHECK-NEXT: lsrs r2, r0, #3
; CHECK-NEXT: subs r1, r1, r2
; CHECK-NEXT: subs r2, r1, #1
; CHECK-NEXT: sbcs r1, r2
; CHECK-NEXT: movs r4, #0
; CHECK-NEXT: cmp r1, #0
; CHECK-NEXT: bne .LBB1_2
; CHECK-NEXT: beq .LBB1_2
; CHECK-NEXT: @ %bb.1:
; CHECK-NEXT: lsls r0, r0, #3
; CHECK-NEXT: b .LBB1_3
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: mvns r0, r4
; CHECK-NEXT: .LBB1_3:
; CHECK-NEXT: .LBB1_2:
; CHECK-NEXT: bl _Znam
; CHECK-NEXT: mov r0, r4
; CHECK-NEXT: pop {r4, pc}
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/X86/pr32588.ll
@@ -9,9 +9,8 @@ define void @fn1() {
; CHECK-LABEL: fn1:
; CHECK: # %bb.0:
; CHECK-NEXT: xorl %eax, %eax
; CHECK-NEXT: cmpl $1, c(%rip)
; CHECK-NEXT: sbbl %eax, %eax
; CHECK-NEXT: andl $1, %eax
; CHECK-NEXT: cmpl $0, c(%rip)
; CHECK-NEXT: sete %al
; CHECK-NEXT: movl %eax, d(%rip)
; CHECK-NEXT: retq
%t0 = load i32, i32* @c, align 4
5 changes: 2 additions & 3 deletions llvm/test/CodeGen/X86/pull-binop-through-shift.ll
@@ -195,10 +195,9 @@ define i32 @and_signbit_lshr(i32 %x, i32* %dst) {
;
; X86-LABEL: and_signbit_lshr:
; X86: # %bb.0:
; X86-NEXT: movzwl 6(%esp), %eax
; X86-NEXT: shll $16, %eax
; X86-NEXT: movl 8(%esp), %ecx
; X86-NEXT: shrl $8, %eax
; X86-NEXT: movzwl 6(%esp), %eax
; X86-NEXT: shll $8, %eax
; X86-NEXT: movl %eax, (%ecx)
; X86-NEXT: retl
%t0 = and i32 %x, 4294901760 ; 0xFFFF0000
20 changes: 13 additions & 7 deletions llvm/test/CodeGen/X86/rotate-extract-vector.ll
@@ -147,13 +147,19 @@ define <32 x i16> @illegal_no_extract_mul(<32 x i16> %i) nounwind {

; Result would undershift
define <4 x i64> @no_extract_shl(<4 x i64> %i) nounwind {
; CHECK-LABEL: no_extract_shl:
; CHECK: # %bb.0:
; CHECK-NEXT: vpsllq $11, %ymm0, %ymm1
; CHECK-NEXT: vpsllq $24, %ymm0, %ymm0
; CHECK-NEXT: vpsrlq $50, %ymm1, %ymm1
; CHECK-NEXT: vpor %ymm0, %ymm1, %ymm0
; CHECK-NEXT: ret{{[l|q]}}
; X86-LABEL: no_extract_shl:
; X86: # %bb.0:
; X86-NEXT: vpsllq $24, %ymm0, %ymm1
; X86-NEXT: vpsrlq $39, %ymm0, %ymm0
; X86-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: no_extract_shl:
; X64: # %bb.0:
; X64-NEXT: vpsllq $24, %ymm0, %ymm1
; X64-NEXT: vpsrlq $39, %ymm0, %ymm0
; X64-NEXT: vpternlogq $236, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %ymm1, %ymm0
; X64-NEXT: retq
%lhs_mul = shl <4 x i64> %i, <i64 11, i64 11, i64 11, i64 11>
%rhs_mul = shl <4 x i64> %i, <i64 24, i64 24, i64 24, i64 24>
%lhs_shift = lshr <4 x i64> %lhs_mul, <i64 50, i64 50, i64 50, i64 50>
14 changes: 7 additions & 7 deletions llvm/test/CodeGen/X86/rotate-extract.ll
@@ -135,21 +135,21 @@ define i64 @no_extract_shl(i64 %i) nounwind {
; X86-LABEL: no_extract_shl:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl %edx, %eax
; X86-NEXT: shll $5, %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %eax, %edx
; X86-NEXT: shldl $10, %ecx, %edx
; X86-NEXT: shll $10, %ecx
; X86-NEXT: shrl $25, %eax
; X86-NEXT: shrl $20, %eax
; X86-NEXT: andl $127, %eax
; X86-NEXT: orl %ecx, %eax
; X86-NEXT: retl
;
; X64-LABEL: no_extract_shl:
; X64: # %bb.0:
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: shlq $5, %rax
; X64-NEXT: shlq $10, %rdi
; X64-NEXT: shrq $57, %rax
; X64-NEXT: shlq $10, %rax
; X64-NEXT: shrq $52, %rdi
; X64-NEXT: andl $127, %edi
; X64-NEXT: orq %rdi, %rax
; X64-NEXT: retq
%lhs_mul = shl i64 %i, 5
