Skip to content

Commit

Permalink
[AArch64][SVE] Invert VSelect operand order and condition for predica…
Browse files Browse the repository at this point in the history
…ted arithmetic operations

   (vselect (setcc ( condcode) (_) (_)) (a)          (op (a) (b)))
=> (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))

As a follow-up to D117689, invert the operand order and the condition
so that vselects can be folded into predicated instructions.

Differential Revision: https://reviews.llvm.org/D119424
  • Loading branch information
MDevereau committed Feb 17, 2022
1 parent 2aa624a commit 2f2dcb4
Show file tree
Hide file tree
Showing 3 changed files with 153 additions and 9 deletions.
39 changes: 39 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Expand Up @@ -17152,12 +17152,51 @@ static SDValue performTBZCombine(SDNode *N,
DAG.getConstant(Bit, DL, MVT::i64), N->getOperand(3));
}

// Swap vselect operands where it may allow a predicated operation to achieve
// the `sel`.
//
// (vselect (setcc ( condcode) (_) (_)) (a) (op (a) (b)))
// => (vselect (setcc (!condcode) (_) (_)) (op (a) (b)) (a))
//
// Only applies when:
//   * the vselect produces a scalable vector,
//   * the condition is a SETCC whose only user is this vselect (otherwise the
//     original compare would have to be kept alongside the inverted one),
//   * the false operand is an FMUL/FSUB/FADD whose first operand is the true
//     operand of the vselect.
//
// Returns the rewritten VSELECT, or an empty SDValue if the pattern does not
// match.
static SDValue trySwapVSelectOperands(SDNode *N, SelectionDAG &DAG) {
  SDValue SelectA = N->getOperand(1);
  SDValue SelectB = N->getOperand(2);
  EVT NTy = N->getValueType(0);

  if (!NTy.isScalableVector())
    return SDValue();

  SDValue SetCC = N->getOperand(0);
  if (SetCC.getOpcode() != ISD::SETCC || !SetCC.hasOneUse())
    return SDValue();

  switch (SelectB.getOpcode()) {
  default:
    return SDValue();
  case ISD::FMUL:
  case ISD::FSUB:
  case ISD::FADD:
    break;
  }
  if (SelectA != SelectB.getOperand(0))
    return SDValue();

  // The inversion rule depends on the type of the values being *compared*,
  // not on the type of the SETCC result. The result is a predicate (integer)
  // vector, so passing it would apply the integer inversion to a
  // floating-point condition code (e.g. turning OEQ into ONE rather than UNE)
  // and mis-handle unordered (NaN) lanes.
  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
  ISD::CondCode InverseCC =
      ISD::getSetCCInverse(CC, SetCC.getOperand(0).getValueType());
  SDValue InverseSetCC =
      DAG.getSetCC(SDLoc(SetCC), SetCC.getValueType(), SetCC.getOperand(0),
                   SetCC.getOperand(1), InverseCC);

  return DAG.getNode(ISD::VSELECT, SDLoc(N), NTy,
                     {InverseSetCC, SelectB, SelectA});
}

// vselect (v1i1 setcc) ->
// vselect (v1iXX setcc) (XX is the size of the compared operand type)
// FIXME: Currently the type legalizer can't handle VSELECT having v1i1 as
// condition. If it can legalize "VSELECT v1i1" correctly, no need to combine
// such VSELECT.
static SDValue performVSelectCombine(SDNode *N, SelectionDAG &DAG) {
if (auto SwapResult = trySwapVSelectOperands(N, DAG))
return SwapResult;

SDValue N0 = N->getOperand(0);
EVT CCVT = N0.getValueType();

Expand Down
15 changes: 6 additions & 9 deletions llvm/test/CodeGen/AArch64/sve-fp-reciprocal.ll
Expand Up @@ -95,14 +95,13 @@ define <vscale x 8 x half> @fsqrt_recip_8f16(<vscale x 8 x half> %a) #0 {
; CHECK-NEXT: frsqrte z1.h, z0.h
; CHECK-NEXT: ptrue p0.h
; CHECK-NEXT: fmul z2.h, z1.h, z1.h
; CHECK-NEXT: fcmeq p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT: fcmne p0.h, p0/z, z0.h, #0.0
; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z2.h, z1.h, z1.h
; CHECK-NEXT: frsqrts z2.h, z0.h, z2.h
; CHECK-NEXT: fmul z1.h, z1.h, z2.h
; CHECK-NEXT: fmul z1.h, z0.h, z1.h
; CHECK-NEXT: sel z0.h, p0, z0.h, z1.h
; CHECK-NEXT: fmul z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: ret
%fsqrt = call fast <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> %a)
ret <vscale x 8 x half> %fsqrt
Expand All @@ -124,14 +123,13 @@ define <vscale x 4 x float> @fsqrt_recip_4f32(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: frsqrte z1.s, z0.s
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fmul z2.s, z1.s, z1.s
; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s
; CHECK-NEXT: fmul z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z2.s, z1.s, z1.s
; CHECK-NEXT: frsqrts z2.s, z0.s, z2.s
; CHECK-NEXT: fmul z1.s, z1.s, z2.s
; CHECK-NEXT: fmul z1.s, z0.s, z1.s
; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%fsqrt = call fast <vscale x 4 x float> @llvm.sqrt.nxv4f32(<vscale x 4 x float> %a)
ret <vscale x 4 x float> %fsqrt
Expand All @@ -153,7 +151,7 @@ define <vscale x 2 x double> @fsqrt_recip_2f64(<vscale x 2 x double> %a) #0 {
; CHECK-NEXT: frsqrte z1.d, z0.d
; CHECK-NEXT: ptrue p0.d
; CHECK-NEXT: fmul z2.d, z1.d, z1.d
; CHECK-NEXT: fcmeq p0.d, p0/z, z0.d, #0.0
; CHECK-NEXT: fcmne p0.d, p0/z, z0.d, #0.0
; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d
; CHECK-NEXT: fmul z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z2.d, z1.d, z1.d
Expand All @@ -162,8 +160,7 @@ define <vscale x 2 x double> @fsqrt_recip_2f64(<vscale x 2 x double> %a) #0 {
; CHECK-NEXT: fmul z2.d, z1.d, z1.d
; CHECK-NEXT: frsqrts z2.d, z0.d, z2.d
; CHECK-NEXT: fmul z1.d, z1.d, z2.d
; CHECK-NEXT: fmul z1.d, z0.d, z1.d
; CHECK-NEXT: sel z0.d, p0, z0.d, z1.d
; CHECK-NEXT: fmul z0.d, p0/m, z0.d, z1.d
; CHECK-NEXT: ret
%fsqrt = call fast <vscale x 2 x double> @llvm.sqrt.nxv2f64(<vscale x 2 x double> %a)
ret <vscale x 2 x double> %fsqrt
Expand Down
108 changes: 108 additions & 0 deletions llvm/test/CodeGen/AArch64/sve-select.ll
Expand Up @@ -542,3 +542,111 @@ define <vscale x 16 x i1> @icmp_select_nxv16i1(<vscale x 16 x i1> %a, <vscale x
%sel = select i1 %mask, <vscale x 16 x i1> %a, <vscale x 16 x i1> %b
ret <vscale x 16 x i1> %sel
}

; Select between %a (true arm) and fmul(%a, %b) (false arm) on a scalable
; vector. The expected output uses an inverted compare (fcmne for the oeq
; condition) feeding a merging predicated fmul, with no separate sel.
define <vscale x 4 x float> @select_f32_invert_fmul(<vscale x 4 x float> %a, <vscale x 4 x float> %b) #0 {
; CHECK-LABEL: select_f32_invert_fmul:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
%fmul = fmul <vscale x 4 x float> %a, %b
%sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %a, <vscale x 4 x float> %fmul
ret <vscale x 4 x float> %sel
}

; Select between %a (true arm) and fadd(%a, %b) (false arm) on a scalable
; vector. The expected output uses an inverted compare (fcmne for the oeq
; condition) feeding a merging predicated fadd, with no separate sel.
define <vscale x 4 x float> @select_f32_invert_fadd(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
; CHECK-LABEL: select_f32_invert_fadd:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: fadd z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
%fadd = fadd <vscale x 4 x float> %a, %b
%sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %a, <vscale x 4 x float> %fadd
ret <vscale x 4 x float> %sel
}

; Select between %a (true arm) and fsub(%a, %b) (false arm) on a scalable
; vector. The expected output uses an inverted compare (fcmne for the oeq
; condition) feeding a merging predicated fsub, with no separate sel.
define <vscale x 4 x float> @select_f32_invert_fsub(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
; CHECK-LABEL: select_f32_invert_fsub:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fcmne p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: fsub z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
%fsub = fsub <vscale x 4 x float> %a, %b
%sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %a, <vscale x 4 x float> %fsub
ret <vscale x 4 x float> %sel
}

; Negative case for the operand swap: the fmul is already the true arm of the
; select and %a is the false arm. The expected output keeps the original
; compare (fcmeq) and uses a merging predicated fmul directly.
define <vscale x 4 x float> @select_f32_no_invert_op_lhs(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
; CHECK-LABEL: select_f32_no_invert_op_lhs:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: fmul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: ret
%p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
%fmul = fmul <vscale x 4 x float> %a, %b
%sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %fmul, <vscale x 4 x float> %a
ret <vscale x 4 x float> %sel
}

; Negative case for the operand swap: both arms of the select are fmuls of
; unrelated operands, so neither arm is a plain input of the other. The
; expected output keeps the original compare (fcmeq), two unpredicated fmuls
; and an explicit sel.
define <vscale x 4 x float> @select_f32_no_invert_2_op(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, <vscale x 4 x float> %d) {
; CHECK-LABEL: select_f32_no_invert_2_op:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fmul z2.s, z2.s, z3.s
; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: fmul z0.s, z0.s, z1.s
; CHECK-NEXT: sel z0.s, p0, z0.s, z2.s
; CHECK-NEXT: ret
%p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
%fmul1 = fmul <vscale x 4 x float> %a, %b
%fmul2 = fmul <vscale x 4 x float> %c, %d
%sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %fmul1, <vscale x 4 x float> %fmul2
ret <vscale x 4 x float> %sel
}

; Both arms of the select are the same value %m, so the select is redundant.
; The expected output is just the fmul, with no compare and no sel.
define <vscale x 4 x float> @select_f32_no_invert_equal_ops(<vscale x 4 x float> %a, <vscale x 4 x float> %b) {
; CHECK-LABEL: select_f32_no_invert_equal_ops:
; CHECK: // %bb.0:
; CHECK-NEXT: fmul z0.s, z0.s, z1.s
; CHECK-NEXT: ret
%m = fmul <vscale x 4 x float> %a, %b
%p = fcmp oeq <vscale x 4 x float> %m, zeroinitializer
%sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %m, <vscale x 4 x float> %m
ret <vscale x 4 x float> %sel
}

; Negative case for the operand swap: the compare result %p feeds two selects,
; so it has more than one use. The expected output keeps the original compare
; (fcmeq), an unpredicated fadd and an explicit sel.
; NOTE(review): despite "fmul" in the name, the arithmetic op here is an fadd,
; and the %len argument is unused.
define <vscale x 4 x float> @select_f32_no_invert_fmul_two_setcc_uses(<vscale x 4 x float> %a, <vscale x 4 x float> %b, <vscale x 4 x float> %c, i32 %len) #0 {
; CHECK-LABEL: select_f32_no_invert_fmul_two_setcc_uses:
; CHECK: // %bb.0:
; CHECK-NEXT: ptrue p0.s
; CHECK-NEXT: fadd z1.s, z0.s, z1.s
; CHECK-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0
; CHECK-NEXT: sel z0.s, p0, z0.s, z1.s
; CHECK-NEXT: mov z0.s, p0/m, z2.s
; CHECK-NEXT: ret
%p = fcmp oeq <vscale x 4 x float> %a, zeroinitializer
%fadd = fadd <vscale x 4 x float> %a, %b
%sel = select <vscale x 4 x i1> %p, <vscale x 4 x float> %a, <vscale x 4 x float> %fadd
%sel2 = select <vscale x 4 x i1> %p, <vscale x 4 x float> %c, <vscale x 4 x float> %sel
ret <vscale x 4 x float> %sel2
}

; Negative case for the operand swap: same select shape as the invert tests
; (%a in the true arm, fmul(%a, %b) in the false arm) but on a fixed-width
; <4 x float>. The expected output keeps the original compare (fcmeq) and
; selects with a bitwise insert (bif).
define <4 x float> @select_f32_no_invert_not_scalable(<4 x float> %a, <4 x float> %b) #0 {
; CHECK-LABEL: select_f32_no_invert_not_scalable:
; CHECK: // %bb.0:
; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0
; CHECK-NEXT: fmul v1.4s, v0.4s, v1.4s
; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
; CHECK-NEXT: ret
%p = fcmp oeq <4 x float> %a, zeroinitializer
%fmul = fmul <4 x float> %a, %b
%sel = select <4 x i1> %p, <4 x float> %a, <4 x float> %fmul
ret <4 x float> %sel
}

0 comments on commit 2f2dcb4

Please sign in to comment.