[Target][ARM] Fold or(A, B) more aggressively for I1 vectors
This patch makes the folding of or(A, B) into not(and(not(A), not(B)))
more aggressive for I1 vectors. This only affects Thumb2 MVE and improves
codegen, because it removes a lot of msr/mrs instructions on VPR.P0.
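
The rewrite is De Morgan's law applied to whole predicate masks. A minimal
standalone sketch of that identity, using a 16-bit mask as a stand-in for the
MVE predicate in VPR.P0 (the mask width here is purely illustrative):

// Illustrative only: or(A, B) == not(and(not(A), not(B))) on a 16-bit mask.
#include <cassert>
#include <cstdint>

int main() {
  for (unsigned M = 0; M <= 0xFFFF; ++M) {
    uint16_t A = static_cast<uint16_t>(M);
    uint16_t B = static_cast<uint16_t>(M * 0x9E37u); // arbitrary second mask
    uint16_t NotA = static_cast<uint16_t>(~A);
    uint16_t NotB = static_cast<uint16_t>(~B);
    uint16_t Rewritten = static_cast<uint16_t>(~(NotA & NotB));
    assert(Rewritten == static_cast<uint16_t>(A | B));
  }
  return 0;
}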

This patch also adds an xor(vcmp) -> !vcmp fold for MVE.

Differential Revision: https://reviews.llvm.org/D77202
Pierre-vh committed May 5, 2020
1 parent ffdda49 commit d5eb7ff
Showing 6 changed files with 157 additions and 168 deletions.
87 changes: 47 additions & 40 deletions llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -12651,58 +12651,44 @@ static bool isValidMVECond(unsigned CC, bool IsFloat) {
};
}

static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
if (N->getOpcode() == ARMISD::VCMP)
return (ARMCC::CondCodes)N->getConstantOperandVal(2);
else if (N->getOpcode() == ARMISD::VCMPZ)
return (ARMCC::CondCodes)N->getConstantOperandVal(1);
else
llvm_unreachable("Not a VCMP/VCMPZ!");
}

static bool CanInvertMVEVCMP(SDValue N) {
ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
}

static SDValue PerformORCombine_i1(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
// together with predicates
EVT VT = N->getValueType(0);
SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);

ARMCC::CondCodes CondCode0 = ARMCC::AL;
ARMCC::CondCodes CondCode1 = ARMCC::AL;
if (N0->getOpcode() == ARMISD::VCMP)
CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2))
->getZExtValue();
else if (N0->getOpcode() == ARMISD::VCMPZ)
CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1))
->getZExtValue();
if (N1->getOpcode() == ARMISD::VCMP)
CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2))
->getZExtValue();
else if (N1->getOpcode() == ARMISD::VCMPZ)
CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1))
->getZExtValue();

if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
return SDValue();

unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);
auto IsFreelyInvertable = [&](SDValue V) {
if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
return CanInvertMVEVCMP(V);
return false;
};

if (!isValidMVECond(Opposite0,
N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
!isValidMVECond(Opposite1,
N1->getOperand(0)->getValueType(0).isFloatingPoint()))
// At least one operand must be freely invertable.
if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
return SDValue();

SmallVector<SDValue, 4> Ops0;
Ops0.push_back(N0->getOperand(0));
if (N0->getOpcode() == ARMISD::VCMP)
Ops0.push_back(N0->getOperand(1));
Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
SmallVector<SDValue, 4> Ops1;
Ops1.push_back(N1->getOperand(0));
if (N1->getOpcode() == ARMISD::VCMP)
Ops1.push_back(N1->getOperand(1));
Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));

SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
SDValue NewN0 = DCI.DAG.getLogicalNOT(DL, N0, VT);
SDValue NewN1 = DCI.DAG.getLogicalNOT(DL, N1, VT);
SDValue And = DCI.DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
return DCI.DAG.getLogicalNOT(DL, And, VT);
}
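
The combine above only requires one operand to be freely invertible. A rough
sketch of the reasoning (a cost count for illustration, not LLVM code): a NOT
of an invertible VCMP/VCMPZ folds away by flipping its condition code (see the
xor fold below), so the only NOT that has to survive is the outer one, which
the updated tests show becoming a single VPNOT.

// Illustrative cost count: logical NOTs left after rewriting or(A, B) into
// not(and(not(A), not(B))), assuming a NOT of an invertible VCMP/VCMPZ is
// free because it just flips the condition code.
#include <cassert>

static int RemainingNots(bool AIsInvertibleCmp, bool BIsInvertibleCmp) {
  int Nots = 1; // the outer not(...), typically a single VPNOT
  if (!AIsInvertibleCmp)
    ++Nots; // not(A) stays as a real operation
  if (!BIsInvertibleCmp)
    ++Nots; // not(B) stays as a real operation
  return Nots;
}

int main() {
  assert(RemainingNots(true, true) == 1);   // both compares invert for free
  assert(RemainingNots(true, false) == 2);  // one extra NOT; still worthwhile
  assert(RemainingNots(false, false) == 3); // neither inverts; the combine bails out
  return 0;
}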

/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
@@ -12823,6 +12809,27 @@ static SDValue PerformXORCombine(SDNode *N,
return Result;
}

if (Subtarget->hasMVEIntegerOps()) {
// fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
const TargetLowering *TLI = Subtarget->getTargetLowering();
if (TLI->isConstTrueVal(N1.getNode()) &&
(N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
if (CanInvertMVEVCMP(N0)) {
SDLoc DL(N0);
ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));

SmallVector<SDValue, 4> Ops;
Ops.push_back(N0->getOperand(0));
if (N0->getOpcode() == ARMISD::VCMP)
Ops.push_back(N0->getOperand(1));
Ops.push_back(DCI.DAG.getConstant(CC, DL, MVT::i32));
return DCI.DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
}
}
}

return SDValue();
}
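
On an i1 lane, XOR with true is a logical NOT, and negating a compare is the
same as emitting it with the opposite condition, which is what the fold above
exploits when MVE can encode that opposite condition. A lane-by-lane sketch of
the equivalence (a standalone illustration, not LLVM code, using unsigned >=
and < as the condition pair):

// Illustrative only: not(a >= b) == (a < b) for each lane, so
// xor(vcmp, all-ones) can be re-emitted as a vcmp with the opposite
// condition instead of moving the predicate to a GPR and inverting it.
#include <cassert>
#include <cstdint>

int main() {
  const uint32_t A[4] = {0u, 7u, 42u, 0xFFFFFFFFu};
  const uint32_t B[4] = {1u, 7u, 41u, 0u};
  for (int Lane = 0; Lane < 4; ++Lane) {
    bool Cmp = A[Lane] >= B[Lane];     // the original vcmp result on this lane
    bool Inverted = Cmp ^ true;        // xor with the all-ones predicate
    bool Opposite = A[Lane] < B[Lane]; // the compare with the opposite condition
    assert(Inverted == Opposite);
  }
  return 0;
}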

@@ -296,9 +296,8 @@ for.cond.cleanup: ; preds = %middle.block, %entry
define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* noalias nocapture readonly %c, i32* noalias nocapture readonly %d, i32 %N) {
; CHECK-LABEL: or_mul_reduce_add:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push {r4, r5, r6, lr}
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: ldr.w r12, [sp, #20]
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: ldr.w r12, [sp, #16]
; CHECK-NEXT: cmp.w r12, #0
; CHECK-NEXT: beq .LBB3_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
@@ -315,34 +314,27 @@ define dso_local i32 @or_mul_reduce_add(i32* noalias nocapture readonly %a, i32*
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vctp.32 r12
; CHECK-NEXT: vmov q0, q1
; CHECK-NEXT: vstr p0, [sp] @ 4-byte Spill
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r1], #16
; CHECK-NEXT: vldrwt.u32 q2, [r0], #16
; CHECK-NEXT: vpnot
; CHECK-NEXT: vsub.i32 q1, q2, q1
; CHECK-NEXT: vcmp.i32 eq, q1, zr
; CHECK-NEXT: vmrs r5, p0
; CHECK-NEXT: vldr p0, [sp] @ 4-byte Reload
; CHECK-NEXT: vmrs r6, p0
; CHECK-NEXT: orrs r5, r6
; CHECK-NEXT: vmsr p0, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
; CHECK-NEXT: vldrwt.u32 q2, [r2], #16
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vpstee
; CHECK-NEXT: vcmpt.i32 ne, q1, zr
; CHECK-NEXT: vldrwe.u32 q1, [r3], #16
; CHECK-NEXT: vldrwe.u32 q2, [r2], #16
; CHECK-NEXT: vmul.i32 q1, q2, q1
; CHECK-NEXT: vadd.i32 q1, q1, q0
; CHECK-NEXT: le lr, .LBB3_2
; CHECK-NEXT: @ %bb.3: @ %middle.block
; CHECK-NEXT: vctp.32 r4
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: vaddv.u32 r0, q0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: pop {r4, r5, r7, pc}
; CHECK-NEXT: .LBB3_4:
; CHECK-NEXT: movs r0, #0
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop {r4, r5, r6, pc}
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
%cmp8 = icmp eq i32 %N, 0
br i1 %cmp8, label %for.cond.cleanup, label %vector.ph
50 changes: 20 additions & 30 deletions llvm/test/CodeGen/Thumb2/mve-pred-or.ll
@@ -124,12 +124,10 @@ define arm_aapcs_vfpcc <4 x i32> @cmpulez_v4i1(<4 x i32> %a, <4 x i32> %b) {
; CHECK-LABEL: cmpulez_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.u32 cs, q1, zr
; CHECK-NEXT: vmrs r0, p0
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vmrs r1, p0
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vcmpt.i32 ne, q0, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp eq <4 x i32> %a, zeroinitializer
@@ -247,12 +245,10 @@ define arm_aapcs_vfpcc <4 x i32> @cmpult_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i
; CHECK-LABEL: cmpult_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.u32 hi, q2, q1
; CHECK-NEXT: vmrs r0, p0
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vmrs r1, p0
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vcmpt.i32 ne, q0, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp eq <4 x i32> %a, zeroinitializer
@@ -266,12 +262,10 @@ define arm_aapcs_vfpcc <4 x i32> @cmpugt_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i
; CHECK-LABEL: cmpugt_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.u32 hi, q1, q2
; CHECK-NEXT: vmrs r0, p0
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vmrs r1, p0
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vcmpt.i32 ne, q0, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp eq <4 x i32> %a, zeroinitializer
@@ -285,12 +279,10 @@ define arm_aapcs_vfpcc <4 x i32> @cmpule_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i
; CHECK-LABEL: cmpule_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.u32 cs, q2, q1
; CHECK-NEXT: vmrs r0, p0
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vmrs r1, p0
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vcmpt.i32 ne, q0, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp eq <4 x i32> %a, zeroinitializer
@@ -304,12 +296,10 @@ define arm_aapcs_vfpcc <4 x i32> @cmpuge_v4i1(<4 x i32> %a, <4 x i32> %b, <4 x i
; CHECK-LABEL: cmpuge_v4i1:
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: vcmp.u32 cs, q1, q2
; CHECK-NEXT: vmrs r0, p0
; CHECK-NEXT: vcmp.i32 eq, q0, zr
; CHECK-NEXT: vmrs r1, p0
; CHECK-NEXT: orrs r0, r1
; CHECK-NEXT: vmsr p0, r0
; CHECK-NEXT: vpsel q0, q0, q1
; CHECK-NEXT: vpnot
; CHECK-NEXT: vpst
; CHECK-NEXT: vcmpt.i32 ne, q0, zr
; CHECK-NEXT: vpsel q0, q1, q0
; CHECK-NEXT: bx lr
entry:
%c1 = icmp eq <4 x i32> %a, zeroinitializer
32 changes: 16 additions & 16 deletions llvm/test/CodeGen/Thumb2/mve-vcmpf.ll
@@ -483,8 +483,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ugt_v4f32(<4 x float> %src, <4 x float>
;
; CHECK-MVEFP-LABEL: vcmp_ugt_v4f32:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vcmp.f32 ge, q1, q0
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
; CHECK-MVEFP-NEXT: vcmp.f32 lt, q1, q0
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
; CHECK-MVEFP-NEXT: bx lr
entry:
%c = fcmp ugt <4 x float> %src, %src2
@@ -535,8 +535,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_uge_v4f32(<4 x float> %src, <4 x float>
;
; CHECK-MVEFP-LABEL: vcmp_uge_v4f32:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vcmp.f32 gt, q1, q0
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
; CHECK-MVEFP-NEXT: vcmp.f32 le, q1, q0
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
; CHECK-MVEFP-NEXT: bx lr
entry:
%c = fcmp uge <4 x float> %src, %src2
@@ -587,8 +587,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ult_v4f32(<4 x float> %src, <4 x float>
;
; CHECK-MVEFP-LABEL: vcmp_ult_v4f32:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vcmp.f32 ge, q0, q1
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
; CHECK-MVEFP-NEXT: vcmp.f32 lt, q0, q1
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
; CHECK-MVEFP-NEXT: bx lr
entry:
%c = fcmp ult <4 x float> %src, %src2
@@ -639,8 +639,8 @@ define arm_aapcs_vfpcc <4 x float> @vcmp_ule_v4f32(<4 x float> %src, <4 x float>
;
; CHECK-MVEFP-LABEL: vcmp_ule_v4f32:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vcmp.f32 gt, q0, q1
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
; CHECK-MVEFP-NEXT: vcmp.f32 le, q0, q1
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
; CHECK-MVEFP-NEXT: bx lr
entry:
%c = fcmp ule <4 x float> %src, %src2
@@ -1897,8 +1897,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ugt_v8f16(<8 x half> %src, <8 x half> %s
;
; CHECK-MVEFP-LABEL: vcmp_ugt_v8f16:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vcmp.f16 ge, q1, q0
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
; CHECK-MVEFP-NEXT: vcmp.f16 lt, q1, q0
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
; CHECK-MVEFP-NEXT: bx lr
entry:
%c = fcmp ugt <8 x half> %src, %src2
@@ -2021,8 +2021,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_uge_v8f16(<8 x half> %src, <8 x half> %s
;
; CHECK-MVEFP-LABEL: vcmp_uge_v8f16:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vcmp.f16 gt, q1, q0
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
; CHECK-MVEFP-NEXT: vcmp.f16 le, q1, q0
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
; CHECK-MVEFP-NEXT: bx lr
entry:
%c = fcmp uge <8 x half> %src, %src2
@@ -2145,8 +2145,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ult_v8f16(<8 x half> %src, <8 x half> %s
;
; CHECK-MVEFP-LABEL: vcmp_ult_v8f16:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vcmp.f16 ge, q0, q1
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
; CHECK-MVEFP-NEXT: vcmp.f16 lt, q0, q1
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
; CHECK-MVEFP-NEXT: bx lr
entry:
%c = fcmp ult <8 x half> %src, %src2
@@ -2269,8 +2269,8 @@ define arm_aapcs_vfpcc <8 x half> @vcmp_ule_v8f16(<8 x half> %src, <8 x half> %s
;
; CHECK-MVEFP-LABEL: vcmp_ule_v8f16:
; CHECK-MVEFP: @ %bb.0: @ %entry
; CHECK-MVEFP-NEXT: vcmp.f16 gt, q0, q1
; CHECK-MVEFP-NEXT: vpsel q0, q3, q2
; CHECK-MVEFP-NEXT: vcmp.f16 le, q0, q1
; CHECK-MVEFP-NEXT: vpsel q0, q2, q3
; CHECK-MVEFP-NEXT: bx lr
entry:
%c = fcmp ule <8 x half> %src, %src2
