Reland "[PowerPC] set libcall lowering for fp setcc ops on SPE boards" #199198
Varnike wants to merge 2 commits (target branch not captured in this extract)
Conversation
|
@llvm/pr-subscribers-llvm-selectiondag
Author: Erik Enikeev (Varnike)
Changes
This is a reland of 4d01007, which was reverted by c24ab4c. The functionality is unchanged from the original patch. This version only fixes the fast-math flag propagation issue by passing the original node's `SDNodeFlags` through to the `getSetCC`/`getSelectCC` calls in `DAGCombiner::visitXOR`.
Patch is 96.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/199198.diff
7 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2a9ecebeb1508..f76ebd2714fa0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10324,22 +10324,25 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
LHS.getValueType());
if (!LegalOperations ||
TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
+ // Propagate fast-math-flags.
+ SDNodeFlags Flags = N0->getFlags();
switch (N0Opcode) {
default:
llvm_unreachable("Unhandled SetCC Equivalent!");
case ISD::SETCC:
- return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
+ return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC, SDValue(),
+ /*IsSignaling=*/false, Flags);
case ISD::SELECT_CC:
return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
- N0.getOperand(3), NotCC);
+ N0.getOperand(3), NotCC, Flags);
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: {
if (N0.hasOneUse()) {
// FIXME Can we handle multiple uses? Could we token factor the chain
// results from the new/old setcc?
SDValue SetCC =
- DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
- N0.getOperand(0), N0Opcode == ISD::STRICT_FSETCCS);
+ DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC, N0.getOperand(0),
+ N0Opcode == ISD::STRICT_FSETCCS, Flags);
CombineTo(N, SetCC);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
recursivelyDeleteUnusedNodes(N0.getNode());
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d8ed51ec9b26f..22d17edf2915f 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -557,8 +557,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
// SPE supports signaling compare of f32/f64.
- setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
- setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
+ // But it doesn't comply with the IEEE-754 rules for comparing
+ // special values such as NaNs and infinities.
+ setOperationAction(ISD::SETCC, MVT::f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Custom);
} else {
// PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
@@ -3590,6 +3598,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
+ const SDNodeFlags Flags = Op.getNode()->getFlags();
ISD::CondCode CC =
cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
SDValue LHS = Op.getOperand(IsStrict ? 1 : 0);
@@ -3598,8 +3607,10 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
EVT LHSVT = LHS.getValueType();
SDLoc dl(Op);
- // Soften the setcc with libcall if it is fp128.
- if (LHSVT == MVT::f128) {
+ // Soften the setcc with libcall if it is fp128 or it is SPE and fp32/fp64.
+ if (LHSVT == MVT::f128 ||
+ (Subtarget.hasSPE() && (LHSVT == MVT::f32 || LHSVT == MVT::f64) &&
+ (!Flags.hasNoNaNs() || !Flags.hasNoInfs()))) {
assert(!Subtarget.hasP9Vector() &&
"SETCC for f128 is already legal under Power9!");
softenSetCCOperands(DAG, LHSVT, LHS, RHS, CC, dl, LHS, RHS, Chain,
@@ -3610,6 +3621,8 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (IsStrict)
return DAG.getMergeValues({LHS, Chain}, dl);
return LHS;
+ } else if (LHSVT == MVT::f32 || LHSVT == MVT::f64) {
+ return Op;
}
assert(!IsStrict && "Don't know how to handle STRICT_FSETCC!");
@@ -3664,6 +3677,35 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+SDValue PPCTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+ const SDNodeFlags Flags = Op->getFlags();
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ EVT LHSVT = LHS.getValueType();
+ SDLoc dl(Op);
+
+ assert(Subtarget.hasSPE() && "LowerBR_CC used only for targets with SPE");
+
+ if ((LHSVT == MVT::f32 || LHSVT == MVT::f64) && Flags.hasNoNaNs() &&
+ Flags.hasNoInfs())
+ return Op;
+
+ softenSetCCOperands(DAG, LHSVT, LHS, RHS, CC, dl, LHS, RHS);
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!RHS) {
+ RHS = DAG.getConstant(0, dl, LHSVT);
+ CC = ISD::SETNE;
+ }
+
+ return DAG.getNode(ISD::BR_CC, dl, Op.getValueType(), Chain,
+ DAG.getCondCode(CC), LHS, RHS, Dest);
+}
+
SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
EVT VT = Node->getValueType(0);
@@ -12762,6 +12804,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS:
case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::BR_CC: return LowerBR_CC(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::SSUBO:
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 1778da5aba2fd..ed47ca96d5953 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -703,6 +703,7 @@ namespace llvm {
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSSUBO(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSADDO(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll b/llvm/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
index df8d37d4d3675..bb708be393562 100644
--- a/llvm/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
+++ b/llvm/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
@@ -9,7 +9,7 @@ entry:
; ELF64-LABEL: @t1a
; SPE-LABEL: @t1a
; VSX-LABEL: @t1a
- %cmp = fcmp oeq float %a, 0.000000e+00
+ %cmp = fcmp nnan ninf oeq float %a, 0.000000e+00
; ELF64: addis
; ELF64: lfs
; ELF64: fcmpu
@@ -32,7 +32,7 @@ entry:
; ELF64-LABEL: @t1b
; SPE-LABEL: @t1b
; VSX-LABEL: @t1b
- %cmp = fcmp oeq float %a, -0.000000e+00
+ %cmp = fcmp nnan ninf oeq float %a, -0.000000e+00
; ELF64: addis
; ELF64: lfs
; ELF64: fcmpu
@@ -55,7 +55,7 @@ entry:
; ELF64-LABEL: @t1c
; SPE-LABEL: @t1c
; VSX-LABEL: @t1c
- %cmp = fcmp oeq float -0.000000e+00, %a
+ %cmp = fcmp nnan ninf oeq float -0.000000e+00, %a
; ELF64: addis
; ELF64: lfs
; ELF64: fcmpu
@@ -78,7 +78,7 @@ entry:
; ELF64-LABEL: @t2a
; SPE-LABEL: @t2a
; VSX-LABEL: @t2a
- %cmp = fcmp oeq double %a, 0.000000e+00
+ %cmp = fcmp nnan ninf oeq double %a, 0.000000e+00
; ELF64: addis
; ELF64: lfd
; ELF64: fcmpu
@@ -101,7 +101,7 @@ entry:
; ELF64-LABEL: @t2b
; SPE-LABEL: @t2b
; VSX-LABEL: @t2b
- %cmp = fcmp oeq double %a, -0.000000e+00
+ %cmp = fcmp nnan ninf oeq double %a, -0.000000e+00
; ELF64: addis
; ELF64: lfd
; ELF64: fcmpu
@@ -124,7 +124,7 @@ entry:
; ELF64-LABEL: @t2c
; SPE-LABEL: @t2c
; VSX-LABEL: @t2c
- %cmp = fcmp oeq double -0.000000e+00, %a
+ %cmp = fcmp nnan ninf oeq double -0.000000e+00, %a
; ELF64: addis
; ELF64: lfd
; ELF64: fcmpu
diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-fcmp-spe.ll b/llvm/test/CodeGen/PowerPC/fp-strict-fcmp-spe.ll
index c20d319f2ac79..ddd06cf691c2e 100644
--- a/llvm/test/CodeGen/PowerPC/fp-strict-fcmp-spe.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict-fcmp-spe.ll
@@ -4,10 +4,27 @@
define i32 @test_f32_oeq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_oeq_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmpeq cr0, r5, r6
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -32(r1)
+; SPE-NEXT: stw r0, 36(r1)
+; SPE-NEXT: stw r29, 20(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 24(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: cmplwi r3, 0
+; SPE-NEXT: beq cr0, .LBB0_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB0_2:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 20(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r0, 36(r1)
+; SPE-NEXT: addi r1, r1, 32
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"oeq", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -17,10 +34,27 @@ define i32 @test_f32_oeq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_ogt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_ogt_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmpgt cr0, r5, r6
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -32(r1)
+; SPE-NEXT: stw r0, 36(r1)
+; SPE-NEXT: stw r29, 20(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 24(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: bl __gtsf2
+; SPE-NEXT: cmpwi r3, 0
+; SPE-NEXT: bgt cr0, .LBB1_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB1_2:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 20(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r0, 36(r1)
+; SPE-NEXT: addi r1, r1, 32
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"ogt", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -30,16 +64,46 @@ define i32 @test_f32_ogt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_oge_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmpeq cr0, r6, r6
-; SPE-NEXT: bc 4, gt, .LBB2_3
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -48(r1)
+; SPE-NEXT: mfcr r12
+; SPE-NEXT: stw r0, 52(r1)
+; SPE-NEXT: stw r12, 24(r1)
+; SPE-NEXT: stw r29, 36(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 40(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: stw r27, 28(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r27, r5
+; SPE-NEXT: stw r28, 32(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r28, r6
+; SPE-NEXT: bl __gesf2
+; SPE-NEXT: cmpwi cr2, r3, -1
+; SPE-NEXT: mr r3, r28
+; SPE-NEXT: mr r4, r28
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: mr r28, r3
+; SPE-NEXT: mr r3, r27
+; SPE-NEXT: mr r4, r27
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: or. r3, r3, r28
+; SPE-NEXT: crand 4*cr5+lt, 4*cr2+gt, eq
+; SPE-NEXT: bc 12, 4*cr5+lt, .LBB2_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: efscmpeq cr0, r5, r5
-; SPE-NEXT: bc 4, gt, .LBB2_3
-; SPE-NEXT: # %bb.2:
-; SPE-NEXT: efscmplt cr0, r5, r6
-; SPE-NEXT: bclr 4, gt, 0
-; SPE-NEXT: .LBB2_3:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB2_2:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 40(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 36(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r28, 32(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r12, 24(r1)
+; SPE-NEXT: lwz r27, 28(r1) # 4-byte Folded Reload
+; SPE-NEXT: mtcrf 32, r12 # cr2
+; SPE-NEXT: lwz r0, 52(r1)
+; SPE-NEXT: addi r1, r1, 48
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"oge", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -49,10 +113,27 @@ define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_olt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_olt_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmplt cr0, r5, r6
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -32(r1)
+; SPE-NEXT: stw r0, 36(r1)
+; SPE-NEXT: stw r29, 20(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 24(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: bl __ltsf2
+; SPE-NEXT: cmpwi r3, 0
+; SPE-NEXT: blt cr0, .LBB3_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB3_2:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 20(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r0, 36(r1)
+; SPE-NEXT: addi r1, r1, 32
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"olt", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -62,16 +143,46 @@ define i32 @test_f32_olt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_ole_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmpeq cr0, r6, r6
-; SPE-NEXT: bc 4, gt, .LBB4_3
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -48(r1)
+; SPE-NEXT: mfcr r12
+; SPE-NEXT: stw r0, 52(r1)
+; SPE-NEXT: stw r12, 24(r1)
+; SPE-NEXT: stw r29, 36(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 40(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: stw r27, 28(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r27, r5
+; SPE-NEXT: stw r28, 32(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r28, r6
+; SPE-NEXT: bl __lesf2
+; SPE-NEXT: cmpwi cr2, r3, 1
+; SPE-NEXT: mr r3, r28
+; SPE-NEXT: mr r4, r28
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: mr r28, r3
+; SPE-NEXT: mr r3, r27
+; SPE-NEXT: mr r4, r27
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: or. r3, r3, r28
+; SPE-NEXT: crand 4*cr5+lt, 4*cr2+lt, eq
+; SPE-NEXT: bc 12, 4*cr5+lt, .LBB4_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: efscmpeq cr0, r5, r5
-; SPE-NEXT: bc 4, gt, .LBB4_3
-; SPE-NEXT: # %bb.2:
-; SPE-NEXT: efscmpgt cr0, r5, r6
-; SPE-NEXT: bclr 4, gt, 0
-; SPE-NEXT: .LBB4_3:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB4_2:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 40(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 36(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r28, 32(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r12, 24(r1)
+; SPE-NEXT: lwz r27, 28(r1) # 4-byte Folded Reload
+; SPE-NEXT: mtcrf 32, r12 # cr2
+; SPE-NEXT: lwz r0, 52(r1)
+; SPE-NEXT: addi r1, r1, 48
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"ole", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -81,13 +192,43 @@ define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_one_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmplt cr0, r5, r6
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -48(r1)
+; SPE-NEXT: mfcr r12
+; SPE-NEXT: stw r0, 52(r1)
+; SPE-NEXT: stw r12, 24(r1)
+; SPE-NEXT: stw r29, 36(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 40(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: stw r27, 28(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r27, r5
+; SPE-NEXT: stw r28, 32(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r28, r6
+; SPE-NEXT: bl __ltsf2
+; SPE-NEXT: cmpwi cr2, r3, 0
+; SPE-NEXT: mr r3, r27
+; SPE-NEXT: mr r4, r28
+; SPE-NEXT: bl __gtsf2
+; SPE-NEXT: bc 12, 4*cr2+lt, .LBB5_3
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: efscmpgt cr0, r5, r6
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: cmpwi r3, 0
+; SPE-NEXT: bc 12, gt, .LBB5_3
; SPE-NEXT: # %bb.2:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB5_3:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 40(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 36(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r28, 32(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r12, 24(r1)
+; SPE-NEXT: lwz r27, 28(r1) # 4-byte Folded Reload
+; SPE-NEXT: mtcrf 32, r12 # cr2
+; SPE-NEXT: lwz r0, 52(r1)
+; SPE-NEXT: addi r1, r1, 48
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"one", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -97,13 +238,36 @@ define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_ord_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmpeq cr0, r6, r6
-; SPE-NEXT: bc 4, gt, .LBB6_2
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -32(r1)
+; SPE-NEXT: stw r0, 36(r1)
+; SPE-NEXT: stw r29, 20(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 24(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r6
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: stw r27, 12(r1) # 4-byte Folded Spill
+; SPE-NEXT: stw r28, 16(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r28, r5
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: mr r27, r3
+; SPE-NEXT: mr r3, r28
+; SPE-NEXT: mr r4, r28
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: or. r3, r3, r27
+; SPE-NEXT: beq cr0, .LBB6_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: efscmpeq cr0, r5, r5
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: mr r30, r29
; SPE-NEXT: .LBB6_2:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 20(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r28, 16(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r27, 12(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r0, 36(r1)
+; SPE-NEXT: addi r1, r1, 32
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"ord", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -113,15 +277,43 @@ define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_ueq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_ueq_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmplt cr0, r5, r6
-; SPE-NEXT: bc 12, gt, .LBB7_3
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -48(r1)
+; SPE-NEXT: mfcr r12
+; SPE-NEXT: stw r0, 52(r1)
+; SPE-NEXT: stw r12, 24(r1)
+; SPE-NEXT: stw r29, 36(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 40(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, ...
[truncated]
|
|
@llvm/pr-subscribers-backend-powerpc
Author: Erik Enikeev (Varnike)
Changes
This is a reland of 4d01007, which was reverted by c24ab4c. The functionality is unchanged from the original patch. This version only fixes the fast-math flag propagation issue by passing the original node's `SDNodeFlags` through to the `getSetCC`/`getSelectCC` calls in `DAGCombiner::visitXOR`.
Patch is 96.59 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/199198.diff
7 Files Affected:
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2a9ecebeb1508..f76ebd2714fa0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -10324,22 +10324,25 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
LHS.getValueType());
if (!LegalOperations ||
TLI.isCondCodeLegal(NotCC, LHS.getSimpleValueType())) {
+ // Propagate fast-math-flags.
+ SDNodeFlags Flags = N0->getFlags();
switch (N0Opcode) {
default:
llvm_unreachable("Unhandled SetCC Equivalent!");
case ISD::SETCC:
- return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
+ return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC, SDValue(),
+ /*IsSignaling=*/false, Flags);
case ISD::SELECT_CC:
return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
- N0.getOperand(3), NotCC);
+ N0.getOperand(3), NotCC, Flags);
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: {
if (N0.hasOneUse()) {
// FIXME Can we handle multiple uses? Could we token factor the chain
// results from the new/old setcc?
SDValue SetCC =
- DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
- N0.getOperand(0), N0Opcode == ISD::STRICT_FSETCCS);
+ DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC, N0.getOperand(0),
+ N0Opcode == ISD::STRICT_FSETCCS, Flags);
CombineTo(N, SetCC);
DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
recursivelyDeleteUnusedNodes(N0.getNode());
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index d8ed51ec9b26f..22d17edf2915f 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -557,8 +557,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
// SPE supports signaling compare of f32/f64.
- setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Legal);
- setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Legal);
+ // But it doesn't comply with the IEEE-754 rules for comparing
+ // special values such as NaNs and infinities.
+ setOperationAction(ISD::SETCC, MVT::f32, Custom);
+ setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f32, Custom);
+ setOperationAction(ISD::BR_CC, MVT::f64, Custom);
} else {
// PowerPC turns FP_TO_SINT into FCTIWZ and some load/stores.
setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
@@ -3590,6 +3598,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
+ const SDNodeFlags Flags = Op.getNode()->getFlags();
ISD::CondCode CC =
cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
SDValue LHS = Op.getOperand(IsStrict ? 1 : 0);
@@ -3598,8 +3607,10 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
EVT LHSVT = LHS.getValueType();
SDLoc dl(Op);
- // Soften the setcc with libcall if it is fp128.
- if (LHSVT == MVT::f128) {
+ // Soften the setcc with libcall if it is fp128 or it is SPE and fp32/fp64.
+ if (LHSVT == MVT::f128 ||
+ (Subtarget.hasSPE() && (LHSVT == MVT::f32 || LHSVT == MVT::f64) &&
+ (!Flags.hasNoNaNs() || !Flags.hasNoInfs()))) {
assert(!Subtarget.hasP9Vector() &&
"SETCC for f128 is already legal under Power9!");
softenSetCCOperands(DAG, LHSVT, LHS, RHS, CC, dl, LHS, RHS, Chain,
@@ -3610,6 +3621,8 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (IsStrict)
return DAG.getMergeValues({LHS, Chain}, dl);
return LHS;
+ } else if (LHSVT == MVT::f32 || LHSVT == MVT::f64) {
+ return Op;
}
assert(!IsStrict && "Don't know how to handle STRICT_FSETCC!");
@@ -3664,6 +3677,35 @@ SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
+SDValue PPCTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
+ const SDNodeFlags Flags = Op->getFlags();
+ SDValue Chain = Op.getOperand(0);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(1))->get();
+ SDValue LHS = Op.getOperand(2);
+ SDValue RHS = Op.getOperand(3);
+ SDValue Dest = Op.getOperand(4);
+ EVT LHSVT = LHS.getValueType();
+ SDLoc dl(Op);
+
+ assert(Subtarget.hasSPE() && "LowerBR_CC used only for targets with SPE");
+
+ if ((LHSVT == MVT::f32 || LHSVT == MVT::f64) && Flags.hasNoNaNs() &&
+ Flags.hasNoInfs())
+ return Op;
+
+ softenSetCCOperands(DAG, LHSVT, LHS, RHS, CC, dl, LHS, RHS);
+
+ // If softenSetCCOperands returned a scalar, we need to compare the result
+ // against zero to select between true and false values.
+ if (!RHS) {
+ RHS = DAG.getConstant(0, dl, LHSVT);
+ CC = ISD::SETNE;
+ }
+
+ return DAG.getNode(ISD::BR_CC, dl, Op.getValueType(), Chain,
+ DAG.getCondCode(CC), LHS, RHS, Dest);
+}
+
SDValue PPCTargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDNode *Node = Op.getNode();
EVT VT = Node->getValueType(0);
@@ -12762,6 +12804,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS:
case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::BR_CC: return LowerBR_CC(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::SSUBO:
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index 1778da5aba2fd..ed47ca96d5953 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -703,6 +703,7 @@ namespace llvm {
SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBR_CC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSSUBO(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSADDO(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll b/llvm/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
index df8d37d4d3675..bb708be393562 100644
--- a/llvm/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
+++ b/llvm/test/CodeGen/PowerPC/fast-isel-cmp-imm.ll
@@ -9,7 +9,7 @@ entry:
; ELF64-LABEL: @t1a
; SPE-LABEL: @t1a
; VSX-LABEL: @t1a
- %cmp = fcmp oeq float %a, 0.000000e+00
+ %cmp = fcmp nnan ninf oeq float %a, 0.000000e+00
; ELF64: addis
; ELF64: lfs
; ELF64: fcmpu
@@ -32,7 +32,7 @@ entry:
; ELF64-LABEL: @t1b
; SPE-LABEL: @t1b
; VSX-LABEL: @t1b
- %cmp = fcmp oeq float %a, -0.000000e+00
+ %cmp = fcmp nnan ninf oeq float %a, -0.000000e+00
; ELF64: addis
; ELF64: lfs
; ELF64: fcmpu
@@ -55,7 +55,7 @@ entry:
; ELF64-LABEL: @t1c
; SPE-LABEL: @t1c
; VSX-LABEL: @t1c
- %cmp = fcmp oeq float -0.000000e+00, %a
+ %cmp = fcmp nnan ninf oeq float -0.000000e+00, %a
; ELF64: addis
; ELF64: lfs
; ELF64: fcmpu
@@ -78,7 +78,7 @@ entry:
; ELF64-LABEL: @t2a
; SPE-LABEL: @t2a
; VSX-LABEL: @t2a
- %cmp = fcmp oeq double %a, 0.000000e+00
+ %cmp = fcmp nnan ninf oeq double %a, 0.000000e+00
; ELF64: addis
; ELF64: lfd
; ELF64: fcmpu
@@ -101,7 +101,7 @@ entry:
; ELF64-LABEL: @t2b
; SPE-LABEL: @t2b
; VSX-LABEL: @t2b
- %cmp = fcmp oeq double %a, -0.000000e+00
+ %cmp = fcmp nnan ninf oeq double %a, -0.000000e+00
; ELF64: addis
; ELF64: lfd
; ELF64: fcmpu
@@ -124,7 +124,7 @@ entry:
; ELF64-LABEL: @t2c
; SPE-LABEL: @t2c
; VSX-LABEL: @t2c
- %cmp = fcmp oeq double -0.000000e+00, %a
+ %cmp = fcmp nnan ninf oeq double -0.000000e+00, %a
; ELF64: addis
; ELF64: lfd
; ELF64: fcmpu
diff --git a/llvm/test/CodeGen/PowerPC/fp-strict-fcmp-spe.ll b/llvm/test/CodeGen/PowerPC/fp-strict-fcmp-spe.ll
index c20d319f2ac79..ddd06cf691c2e 100644
--- a/llvm/test/CodeGen/PowerPC/fp-strict-fcmp-spe.ll
+++ b/llvm/test/CodeGen/PowerPC/fp-strict-fcmp-spe.ll
@@ -4,10 +4,27 @@
define i32 @test_f32_oeq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_oeq_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmpeq cr0, r5, r6
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -32(r1)
+; SPE-NEXT: stw r0, 36(r1)
+; SPE-NEXT: stw r29, 20(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 24(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: cmplwi r3, 0
+; SPE-NEXT: beq cr0, .LBB0_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB0_2:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 20(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r0, 36(r1)
+; SPE-NEXT: addi r1, r1, 32
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"oeq", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -17,10 +34,27 @@ define i32 @test_f32_oeq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_ogt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_ogt_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmpgt cr0, r5, r6
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -32(r1)
+; SPE-NEXT: stw r0, 36(r1)
+; SPE-NEXT: stw r29, 20(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 24(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: bl __gtsf2
+; SPE-NEXT: cmpwi r3, 0
+; SPE-NEXT: bgt cr0, .LBB1_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB1_2:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 20(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r0, 36(r1)
+; SPE-NEXT: addi r1, r1, 32
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"ogt", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -30,16 +64,46 @@ define i32 @test_f32_ogt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_oge_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmpeq cr0, r6, r6
-; SPE-NEXT: bc 4, gt, .LBB2_3
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -48(r1)
+; SPE-NEXT: mfcr r12
+; SPE-NEXT: stw r0, 52(r1)
+; SPE-NEXT: stw r12, 24(r1)
+; SPE-NEXT: stw r29, 36(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 40(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: stw r27, 28(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r27, r5
+; SPE-NEXT: stw r28, 32(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r28, r6
+; SPE-NEXT: bl __gesf2
+; SPE-NEXT: cmpwi cr2, r3, -1
+; SPE-NEXT: mr r3, r28
+; SPE-NEXT: mr r4, r28
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: mr r28, r3
+; SPE-NEXT: mr r3, r27
+; SPE-NEXT: mr r4, r27
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: or. r3, r3, r28
+; SPE-NEXT: crand 4*cr5+lt, 4*cr2+gt, eq
+; SPE-NEXT: bc 12, 4*cr5+lt, .LBB2_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: efscmpeq cr0, r5, r5
-; SPE-NEXT: bc 4, gt, .LBB2_3
-; SPE-NEXT: # %bb.2:
-; SPE-NEXT: efscmplt cr0, r5, r6
-; SPE-NEXT: bclr 4, gt, 0
-; SPE-NEXT: .LBB2_3:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB2_2:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 40(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 36(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r28, 32(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r12, 24(r1)
+; SPE-NEXT: lwz r27, 28(r1) # 4-byte Folded Reload
+; SPE-NEXT: mtcrf 32, r12 # cr2
+; SPE-NEXT: lwz r0, 52(r1)
+; SPE-NEXT: addi r1, r1, 48
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"oge", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -49,10 +113,27 @@ define i32 @test_f32_oge_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_olt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_olt_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmplt cr0, r5, r6
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -32(r1)
+; SPE-NEXT: stw r0, 36(r1)
+; SPE-NEXT: stw r29, 20(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 24(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: bl __ltsf2
+; SPE-NEXT: cmpwi r3, 0
+; SPE-NEXT: blt cr0, .LBB3_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB3_2:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 20(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r0, 36(r1)
+; SPE-NEXT: addi r1, r1, 32
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"olt", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -62,16 +143,46 @@ define i32 @test_f32_olt_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_ole_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmpeq cr0, r6, r6
-; SPE-NEXT: bc 4, gt, .LBB4_3
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -48(r1)
+; SPE-NEXT: mfcr r12
+; SPE-NEXT: stw r0, 52(r1)
+; SPE-NEXT: stw r12, 24(r1)
+; SPE-NEXT: stw r29, 36(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 40(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: stw r27, 28(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r27, r5
+; SPE-NEXT: stw r28, 32(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r28, r6
+; SPE-NEXT: bl __lesf2
+; SPE-NEXT: cmpwi cr2, r3, 1
+; SPE-NEXT: mr r3, r28
+; SPE-NEXT: mr r4, r28
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: mr r28, r3
+; SPE-NEXT: mr r3, r27
+; SPE-NEXT: mr r4, r27
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: or. r3, r3, r28
+; SPE-NEXT: crand 4*cr5+lt, 4*cr2+lt, eq
+; SPE-NEXT: bc 12, 4*cr5+lt, .LBB4_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: efscmpeq cr0, r5, r5
-; SPE-NEXT: bc 4, gt, .LBB4_3
-; SPE-NEXT: # %bb.2:
-; SPE-NEXT: efscmpgt cr0, r5, r6
-; SPE-NEXT: bclr 4, gt, 0
-; SPE-NEXT: .LBB4_3:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB4_2:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 40(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 36(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r28, 32(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r12, 24(r1)
+; SPE-NEXT: lwz r27, 28(r1) # 4-byte Folded Reload
+; SPE-NEXT: mtcrf 32, r12 # cr2
+; SPE-NEXT: lwz r0, 52(r1)
+; SPE-NEXT: addi r1, r1, 48
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"ole", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -81,13 +192,43 @@ define i32 @test_f32_ole_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_one_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmplt cr0, r5, r6
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -48(r1)
+; SPE-NEXT: mfcr r12
+; SPE-NEXT: stw r0, 52(r1)
+; SPE-NEXT: stw r12, 24(r1)
+; SPE-NEXT: stw r29, 36(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 40(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r5
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: stw r27, 28(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r27, r5
+; SPE-NEXT: stw r28, 32(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r28, r6
+; SPE-NEXT: bl __ltsf2
+; SPE-NEXT: cmpwi cr2, r3, 0
+; SPE-NEXT: mr r3, r27
+; SPE-NEXT: mr r4, r28
+; SPE-NEXT: bl __gtsf2
+; SPE-NEXT: bc 12, 4*cr2+lt, .LBB5_3
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: efscmpgt cr0, r5, r6
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: cmpwi r3, 0
+; SPE-NEXT: bc 12, gt, .LBB5_3
; SPE-NEXT: # %bb.2:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r30, r29
+; SPE-NEXT: .LBB5_3:
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 40(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 36(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r28, 32(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r12, 24(r1)
+; SPE-NEXT: lwz r27, 28(r1) # 4-byte Folded Reload
+; SPE-NEXT: mtcrf 32, r12 # cr2
+; SPE-NEXT: lwz r0, 52(r1)
+; SPE-NEXT: addi r1, r1, 48
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"one", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -97,13 +238,36 @@ define i32 @test_f32_one_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_ord_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmpeq cr0, r6, r6
-; SPE-NEXT: bc 4, gt, .LBB6_2
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -32(r1)
+; SPE-NEXT: stw r0, 36(r1)
+; SPE-NEXT: stw r29, 20(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 24(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, r6
+; SPE-NEXT: mr r4, r6
+; SPE-NEXT: stw r27, 12(r1) # 4-byte Folded Spill
+; SPE-NEXT: stw r28, 16(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r28, r5
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: mr r27, r3
+; SPE-NEXT: mr r3, r28
+; SPE-NEXT: mr r4, r28
+; SPE-NEXT: bl __eqsf2
+; SPE-NEXT: or. r3, r3, r27
+; SPE-NEXT: beq cr0, .LBB6_2
; SPE-NEXT: # %bb.1:
-; SPE-NEXT: efscmpeq cr0, r5, r5
-; SPE-NEXT: bclr 12, gt, 0
+; SPE-NEXT: mr r30, r29
; SPE-NEXT: .LBB6_2:
-; SPE-NEXT: mr r3, r4
+; SPE-NEXT: mr r3, r30
+; SPE-NEXT: lwz r30, 24(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r29, 20(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r28, 16(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r27, 12(r1) # 4-byte Folded Reload
+; SPE-NEXT: lwz r0, 36(r1)
+; SPE-NEXT: addi r1, r1, 32
+; SPE-NEXT: mtlr r0
; SPE-NEXT: blr
%cond = call i1 @llvm.experimental.constrained.fcmps.f32(float %f1, float %f2, metadata !"ord", metadata !"fpexcept.strict") #0
%res = select i1 %cond, i32 %a, i32 %b
@@ -113,15 +277,43 @@ define i32 @test_f32_ord_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
define i32 @test_f32_ueq_s(i32 %a, i32 %b, float %f1, float %f2) #0 {
; SPE-LABEL: test_f32_ueq_s:
; SPE: # %bb.0:
-; SPE-NEXT: efscmplt cr0, r5, r6
-; SPE-NEXT: bc 12, gt, .LBB7_3
+; SPE-NEXT: mflr r0
+; SPE-NEXT: stwu r1, -48(r1)
+; SPE-NEXT: mfcr r12
+; SPE-NEXT: stw r0, 52(r1)
+; SPE-NEXT: stw r12, 24(r1)
+; SPE-NEXT: stw r29, 36(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r29, r4
+; SPE-NEXT: stw r30, 40(r1) # 4-byte Folded Spill
+; SPE-NEXT: mr r30, r3
+; SPE-NEXT: mr r3, ...
[truncated]
|
You can test this locally with the following command:
git-clang-format --diff origin/main HEAD --extensions cpp,h -- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp llvm/lib/Target/PowerPC/PPCISelLowering.cpp llvm/lib/Target/PowerPC/PPCISelLowering.h --diff_from_common_commit
View the diff from clang-format here.
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 22d17edf2..e058143db 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12804,7 +12804,8 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS:
case ISD::SETCC: return LowerSETCC(Op, DAG);
- case ISD::BR_CC: return LowerBR_CC(Op, DAG);
+ case ISD::BR_CC:
+ return LowerBR_CC(Op, DAG);
case ISD::INIT_TRAMPOLINE: return LowerINIT_TRAMPOLINE(Op, DAG);
case ISD::ADJUST_TRAMPOLINE: return LowerADJUST_TRAMPOLINE(Op, DAG);
case ISD::SSUBO:
|
|
Drive-by comment: it's usually easier to review reland patches if the pull request contains separate commits for the original patch and the fix. |
This is the original patch introduced in llvm#153238
2b20b5e to
2790f2f
Compare
|
Done, thanks for the advice |
This is a reland of 4d01007, which was reverted by c24ab4c.
The functionality is unchanged from the original patch. This version only fixes the fast-math flag propagation issue by passing
`SDNodeFlags` explicitly to `DAG.getSetCC()` (same as #199105, which was closed because the underlying patch had been reverted).