diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
index df6ce0fe1b037..a4ab3ef1de30c 100644
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -2322,6 +2322,10 @@ class SelectionDAG {
   /// +nan are considered positive, -0.0, -inf and -nan are not.
   LLVM_ABI bool cannotBeOrderedNegativeFP(SDValue Op) const;
 
+  /// Check if all uses of a floating-point value are insensitive to signed
+  /// zeros.
+  LLVM_ABI bool allUsesSignedZeroInsensitive(SDValue Op) const;
+
   /// Test whether two SDValues are known to compare equal. This
   /// is true if they are the same value, or if one is negative zero and the
   /// other positive zero.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2372d7dfe7c3c..73aed33fe0838 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -18891,12 +18891,13 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
   bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
   assert(IsSigned || IsUnsigned);
 
-  bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath;
+  bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath ||
+                          DAG.allUsesSignedZeroInsensitive(SDValue(N, 0));
   // For signed conversions: The optimization changes signed zero behavior.
   if (IsSigned && !IsSignedZeroSafe)
     return SDValue();
   // For unsigned conversions, we need FABS to canonicalize -0.0 to +0.0
-  // (unless NoSignedZerosFPMath is set).
+  // (unless producing a signed zero is acceptable).
   if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
     return SDValue();
 
@@ -19375,10 +19376,17 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
   // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
   // know it was called from a context with a nsz flag if the input fsub does
   // not.
-  if (N0.getOpcode() == ISD::FSUB && N->getFlags().hasNoSignedZeros() &&
-      N0.hasOneUse()) {
-    return DAG.getNode(ISD::FSUB, SDLoc(N), VT, N0.getOperand(1),
-                       N0.getOperand(0));
+  if (N0.getOpcode() == ISD::FSUB && N0.hasOneUse()) {
+    SDValue X = N0.getOperand(0);
+    SDValue Y = N0.getOperand(1);
+
+    // Safe if NoSignedZeros, or if we can prove X != Y (avoiding the -0.0 vs
+    // +0.0 issue). For now, we use a conservative check: if either operand is
+    // known never zero, then X - Y can't produce a signed zero from X == Y.
+    if (N->getFlags().hasNoSignedZeros() || DAG.isKnownNeverZeroFloat(X) ||
+        DAG.isKnownNeverZeroFloat(Y)) {
+      return DAG.getNode(ISD::FSUB, SDLoc(N), VT, Y, X);
+    }
   }
 
   if (SimplifyDemandedBits(SDValue(N, 0)))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 379242ec5a157..61b70ffd26e2f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6075,6 +6075,35 @@ bool SelectionDAG::isKnownNeverZeroFloat(SDValue Op) const {
       Op, [](ConstantFPSDNode *C) { return !C->isZero(); });
 }
 
+bool SelectionDAG::allUsesSignedZeroInsensitive(SDValue Op) const {
+  assert(Op.getValueType().isFloatingPoint());
+  return all_of(Op->uses(), [&](SDUse &Use) {
+    SDNode *User = Use.getUser();
+    unsigned OperandNo = Use.getOperandNo();
+
+    // Check if this use is insensitive to the sign of zero.
+    switch (User->getOpcode()) {
+    case ISD::SETCC:
+      // Comparisons: IEEE-754 specifies +0.0 == -0.0.
+    case ISD::FABS:
+      // fabs clears the sign bit, so -0.0 and +0.0 give the same result.
+      return true;
+    case ISD::FCOPYSIGN:
+      // copysign overwrites the sign bit of the first operand.
+      return OperandNo == 0;
+    case ISD::FADD:
+    case ISD::FSUB: {
+      // Adding or subtracting a value known to be non-zero gives the same
+      // result for a +0.0 and a -0.0 operand, so the sign cannot leak through.
+      SDValue Other = User->getOperand(1 - OperandNo);
+      return isKnownNeverZeroFloat(Other);
+    }
+    default:
+      return false;
+    }
+  });
+}
+
 bool SelectionDAG::isKnownNeverZero(SDValue Op, unsigned Depth) const {
   if (Depth >= MaxRecursionDepth)
     return false; // Limit search depth.
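For reviewers: the standalone C++ sketch below is illustrative only and is not part of the patch; the local names pz/nz and the printed labels are mine. Each case mirrors one arm of the switch in allUsesSignedZeroInsensitive above and shows, at the IEEE-754 level, why that user cannot observe the sign of an incoming zero. It compiles with any C++11-or-later compiler and prints "same" for every line.

// Illustration only -- not part of the patch.
#include <cmath>
#include <cstdio>

int main() {
  double pz = +0.0, nz = -0.0;

  // ISD::SETCC: IEEE-754 comparisons treat +0.0 and -0.0 as equal.
  std::printf("setcc:    %s\n", pz == nz ? "same" : "differ");

  // ISD::FABS: the sign bit is cleared, so both zeros become +0.0.
  std::printf("fabs:     %s\n",
              std::signbit(std::fabs(pz)) == std::signbit(std::fabs(nz))
                  ? "same"
                  : "differ");

  // ISD::FCOPYSIGN (first operand): the sign bit is overwritten by the
  // second operand, so the incoming sign of zero is irrelevant.
  std::printf("copysign: %s\n",
              std::signbit(std::copysign(pz, -1.0)) ==
                      std::signbit(std::copysign(nz, -1.0))
                  ? "same"
                  : "differ");

  // ISD::FADD/FSUB with a known non-zero operand: +0.0 + 1.0 and -0.0 + 1.0
  // both evaluate to exactly 1.0, so the sign of the zero cannot leak through.
  std::printf("fadd:     %s\n", pz + 1.0 == nz + 1.0 ? "same" : "differ");
  return 0;
}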
diff --git a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
index 9a8c555953611..cac155e256572 100644
--- a/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
+++ b/llvm/test/CodeGen/AArch64/fp-to-int-to-fp.ll
@@ -134,7 +134,111 @@ entry:
   ret float %f
 }
 
+define i1 @test_fcmp(float %x) {
+; CHECK-LABEL: test_fcmp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz s0, s0
+; CHECK-NEXT: fcmp s0, #0.0
+; CHECK-NEXT: cset w0, eq
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_fcmp:
+; NO-SIGNED-ZEROS: // %bb.0:
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: fcmp s0, #0.0
+; NO-SIGNED-ZEROS-NEXT: cset w0, eq
+; NO-SIGNED-ZEROS-NEXT: ret
+  %conv1 = fptosi float %x to i32
+  %conv2 = sitofp i32 %conv1 to float
+  %cmp = fcmp oeq float %conv2, 0.0
+  ret i1 %cmp
+}
+
+define float @test_fabs(float %x) {
+; CHECK-LABEL: test_fabs:
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz s0, s0
+; CHECK-NEXT: fabs s0, s0
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_fabs:
+; NO-SIGNED-ZEROS: // %bb.0:
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: fabs s0, s0
+; NO-SIGNED-ZEROS-NEXT: ret
+  %conv1 = fptosi float %x to i32
+  %conv2 = sitofp i32 %conv1 to float
+  %abs = call float @llvm.fabs.f32(float %conv2)
+  ret float %abs
+}
+
+define float @test_copysign(float %x, float %y) {
+; CHECK-LABEL: test_copysign:
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz s0, s0
+; CHECK-NEXT: mvni v2.4s, #128, lsl #24
+; CHECK-NEXT: // kill: def $s1 killed $s1 def $q1
+; CHECK-NEXT: bif v0.16b, v1.16b, v2.16b
+; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_copysign:
+; NO-SIGNED-ZEROS: // %bb.0:
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: mvni v2.4s, #128, lsl #24
+; NO-SIGNED-ZEROS-NEXT: // kill: def $s1 killed $s1 def $q1
+; NO-SIGNED-ZEROS-NEXT: bif v0.16b, v1.16b, v2.16b
+; NO-SIGNED-ZEROS-NEXT: // kill: def $s0 killed $s0 killed $q0
+; NO-SIGNED-ZEROS-NEXT: ret
+  %conv1 = fptosi float %x to i32
+  %conv2 = sitofp i32 %conv1 to float
+  %combine = call float @llvm.copysign.f32(float %conv2, float %y)
+  ret float %combine
+}
+
+define float @test_fadd(float %x) {
+; CHECK-LABEL: test_fadd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz s0, s0
+; CHECK-NEXT: fmov s1, #1.00000000
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_fadd:
+; NO-SIGNED-ZEROS: // %bb.0:
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: fmov s1, #1.00000000
+; NO-SIGNED-ZEROS-NEXT: fadd s0, s0, s1
+; NO-SIGNED-ZEROS-NEXT: ret
+  %conv1 = fptosi float %x to i32
+  %conv2 = sitofp i32 %conv1 to float
+  %add = fadd float %conv2, 1.0
+  ret float %add
+}
+
+define float @test_fsub(float %x) {
+; CHECK-LABEL: test_fsub:
+; CHECK: // %bb.0:
+; CHECK-NEXT: frintz s0, s0
+; CHECK-NEXT: fmov s1, #-1.00000000
+; CHECK-NEXT: fadd s0, s0, s1
+; CHECK-NEXT: ret
+;
+; NO-SIGNED-ZEROS-LABEL: test_fsub:
+; NO-SIGNED-ZEROS: // %bb.0:
+; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
+; NO-SIGNED-ZEROS-NEXT: fmov s1, #-1.00000000 +; NO-SIGNED-ZEROS-NEXT: fadd s0, s0, s1 +; NO-SIGNED-ZEROS-NEXT: ret + %conv1 = fptosi float %x to i32 + %conv2 = sitofp i32 %conv1 to float + %sub = fsub float %conv2, 1.0 + ret float %sub +} + declare i32 @llvm.smin.i32(i32, i32) declare i32 @llvm.smax.i32(i32, i32) declare i32 @llvm.umin.i32(i32, i32) declare i32 @llvm.umax.i32(i32, i32) +declare float @llvm.fabs.f32(float) +declare float @llvm.copysign.f32(float, float) diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll index 1222d0efd62bb..e6ec28b8ed5e8 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.f16.ll @@ -2615,65 +2615,43 @@ define half @select_fneg_posk_src_add_f16(i32 %c, half %x, half %y) { } define half @select_fneg_posk_src_sub_f16(i32 %c, half %x) { -; CI-SAFE-LABEL: select_fneg_posk_src_sub_f16: -; CI-SAFE: ; %bb.0: -; CI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-SAFE-NEXT: v_add_f32_e32 v1, -4.0, v1 -; CI-SAFE-NEXT: v_cndmask_b32_e64 v0, 2.0, -v1, vcc -; CI-SAFE-NEXT: s_setpc_b64 s[30:31] +; CI-LABEL: select_fneg_posk_src_sub_f16: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; CI-NEXT: v_sub_f32_e32 v1, 4.0, v1 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc +; CI-NEXT: s_setpc_b64 s[30:31] ; -; VI-SAFE-LABEL: select_fneg_posk_src_sub_f16: -; VI-SAFE: ; %bb.0: -; VI-SAFE-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-SAFE-NEXT: v_add_f16_e32 v1, -4.0, v1 -; VI-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v1 -; VI-SAFE-NEXT: v_mov_b32_e32 v2, 0x4000 -; VI-SAFE-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-SAFE-NEXT: s_setpc_b64 s[30:31] +; VI-LABEL: select_fneg_posk_src_sub_f16: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_sub_f16_e32 v1, 4.0, v1 +; VI-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-TRUE16-LABEL: select_fneg_posk_src_sub_f16: ; GFX11-SAFE-TRUE16: ; %bb.0: ; GFX11-SAFE-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v1.l, -4.0, v1.l ; GFX11-SAFE-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-TRUE16-NEXT: v_xor_b16 v0.l, 0x8000, v1.l +; GFX11-SAFE-TRUE16-NEXT: v_sub_f16_e32 v0.l, 4.0, v1.l +; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo ; GFX11-SAFE-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SAFE-FAKE16-LABEL: select_fneg_posk_src_sub_f16: ; GFX11-SAFE-FAKE16: ; %bb.0: ; GFX11-SAFE-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SAFE-FAKE16-NEXT: v_add_f16_e32 v1, -4.0, v1 +; GFX11-SAFE-FAKE16-NEXT: v_sub_f16_e32 v1, 4.0, v1 ; GFX11-SAFE-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-FAKE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-SAFE-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; 
GFX11-SAFE-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v1, vcc_lo ; GFX11-SAFE-FAKE16-NEXT: s_setpc_b64 s[30:31] ; -; CI-NSZ-LABEL: select_fneg_posk_src_sub_f16: -; CI-NSZ: ; %bb.0: -; CI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, v1 -; CI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; CI-NSZ-NEXT: v_sub_f32_e32 v1, 4.0, v1 -; CI-NSZ-NEXT: v_cndmask_b32_e32 v0, 2.0, v1, vcc -; CI-NSZ-NEXT: s_setpc_b64 s[30:31] -; -; VI-NSZ-LABEL: select_fneg_posk_src_sub_f16: -; VI-NSZ: ; %bb.0: -; VI-NSZ-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NSZ-NEXT: v_sub_f16_e32 v1, 4.0, v1 -; VI-NSZ-NEXT: v_mov_b32_e32 v2, 0x4000 -; VI-NSZ-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-NSZ-NEXT: s_setpc_b64 s[30:31] -; ; GFX11-NSZ-TRUE16-LABEL: select_fneg_posk_src_sub_f16: ; GFX11-NSZ-TRUE16: ; %bb.0: ; GFX11-NSZ-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll index 9814ed80befbf..f654c8a855394 100644 --- a/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll +++ b/llvm/test/CodeGen/AMDGPU/select-fabs-fneg-extract.v2f16.ll @@ -3277,39 +3277,29 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; CI-LABEL: select_fneg_posk_src_sub_v2f16: ; CI: ; %bb.0: ; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 -; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_add_f32_e32 v3, -4.0, v3 -; CI-NEXT: v_add_f32_e32 v2, -4.0, v2 ; CI-NEXT: v_cvt_f16_f32_e32 v3, v3 -; CI-NEXT: v_cvt_f16_f32_e32 v2, v2 -; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; CI-NEXT: v_or_b32_e32 v2, v2, v3 -; CI-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 -; CI-NEXT: v_cvt_f32_f16_e32 v3, v2 -; CI-NEXT: v_lshrrev_b32_e32 v2, 16, v2 +; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CI-NEXT: v_cvt_f32_f16_e32 v2, v2 -; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v3, vcc +; CI-NEXT: v_cvt_f32_f16_e32 v3, v3 +; CI-NEXT: v_sub_f32_e32 v2, 4.0, v2 +; CI-NEXT: v_sub_f32_e32 v3, 4.0, v3 +; CI-NEXT: v_cndmask_b32_e32 v0, 2.0, v2, vcc ; CI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v2, vcc +; CI-NEXT: v_cndmask_b32_e32 v1, 2.0, v3, vcc ; CI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: select_fneg_posk_src_sub_v2f16: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; VI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; VI-NEXT: v_mov_b32_e32 v1, 0xc400 -; VI-NEXT: v_add_f16_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; VI-NEXT: v_add_f16_e32 v2, -4.0, v2 -; VI-NEXT: v_or_b32_e32 v1, v2, v1 -; VI-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 -; VI-NEXT: v_mov_b32_e32 v2, 0x4000 +; VI-NEXT: v_mov_b32_e32 v1, 0x4400 +; VI-NEXT: v_sub_f16_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_sub_f16_e32 v2, 4.0, v2 +; VI-NEXT: v_mov_b32_e32 v3, 0x4000 ; VI-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; VI-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] -; VI-NEXT: v_cndmask_b32_sdwa v1, v2, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; VI-NEXT: v_cndmask_b32_e64 v0, v3, v2, s[4:5] +; VI-NEXT: v_cndmask_b32_sdwa v1, v3, v1, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; VI-NEXT: v_or_b32_sdwa 
v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; VI-NEXT: s_setpc_b64 s[30:31] ; @@ -3317,8 +3307,7 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GFX9-NEXT: v_pk_add_f16 v1, v2, -4.0 op_sel_hi:[1,0] -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80008000, v1 +; GFX9-NEXT: v_pk_add_f16 v1, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GFX9-NEXT: v_mov_b32_e32 v2, 0x4000 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v1, s[4:5] @@ -3330,28 +3319,25 @@ define <2 x half> @select_fneg_posk_src_sub_v2f16(<2 x i32> %c, <2 x half> %x) { ; GFX11-TRUE16-LABEL: select_fneg_posk_src_sub_v2f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX11-TRUE16-NEXT: v_pk_add_f16 v0, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e64 s0, 0, v1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, 0x80008000, v2 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x4000, v0.l, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-TRUE16-NEXT: v_cndmask_b16 v0.h, 0x4000, v0.h, s0 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: select_fneg_posk_src_sub_v2f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, v2, -4.0 op_sel_hi:[1,0] +; GFX11-FAKE16-NEXT: v_pk_add_f16 v2, v2, 4.0 op_sel_hi:[1,0] neg_lo:[1,0] neg_hi:[1,0] ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_xor_b32_e32 v2, 0x80008000, v2 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3) ; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v2 ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v0, 0x4000, v2, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v1, 0x4000, v3, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %c, zeroinitializer diff --git a/llvm/test/CodeGen/X86/setoeq.ll b/llvm/test/CodeGen/X86/setoeq.ll index 131e279aa645c..8aebf8eaa62e7 100644 --- a/llvm/test/CodeGen/X86/setoeq.ll +++ b/llvm/test/CodeGen/X86/setoeq.ll @@ -18,8 +18,7 @@ define zeroext i8 @oeq_f64_i32(double %x) nounwind readnone { ; AVX-LABEL: oeq_f64_i32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm1 -; AVX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: andl $1, %eax @@ -29,8 +28,7 @@ define zeroext i8 @oeq_f64_i32(double %x) nounwind readnone { ; AVX512-LABEL: oeq_f64_i32: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm1 -; AVX512-NEXT: vcvtdq2pd %xmm1, 
%xmm1 +; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax @@ -67,16 +65,7 @@ define zeroext i8 @oeq_f64_u32(double %x) nounwind readnone { ; AVX-LABEL: oeq_f64_u32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vcvttsd2si %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 -; AVX-NEXT: vcvttsd2si %xmm1, %edx -; AVX-NEXT: andl %ecx, %edx -; AVX-NEXT: orl %eax, %edx -; AVX-NEXT: vmovd %edx, %xmm1 -; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 -; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: andl $1, %eax @@ -86,8 +75,7 @@ define zeroext i8 @oeq_f64_u32(double %x) nounwind readnone { ; AVX512-LABEL: oeq_f64_u32: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vcvttsd2usi %xmm0, %eax -; AVX512-NEXT: vcvtusi2sd %eax, %xmm7, %xmm1 +; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax @@ -131,35 +119,21 @@ define zeroext i8 @oeq_f64_i64(double %x) nounwind readnone { ; ; AVX-LABEL: oeq_f64_i64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: pushl %ebp -; AVX-NEXT: movl %esp, %ebp -; AVX-NEXT: andl $-8, %esp -; AVX-NEXT: subl $24, %esp ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd %xmm0, (%esp) -; AVX-NEXT: fldl (%esp) -; AVX-NEXT: fisttpll (%esp) -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-NEXT: vcmpeqsd {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: andl $1, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: movl %ebp, %esp -; AVX-NEXT: popl %ebp ; AVX-NEXT: retl ; ; AVX512-LABEL: oeq_f64_i64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1 -; AVX512-NEXT: vcvtqq2pd %ymm1, %ymm1 +; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retl entry: %0 = fptosi double %x to i64 @@ -216,48 +190,21 @@ define zeroext i8 @oeq_f64_u64(double %x) nounwind readnone { ; ; AVX-LABEL: oeq_f64_u64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: pushl %ebp -; AVX-NEXT: movl %esp, %ebp -; AVX-NEXT: andl $-8, %esp -; AVX-NEXT: subl $8, %esp ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] -; AVX-NEXT: vucomisd %xmm0, %xmm1 -; AVX-NEXT: jbe .LBB3_2 -; AVX-NEXT: # %bb.1: # %entry -; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: .LBB3_2: # %entry -; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vmovsd %xmm1, (%esp) -; AVX-NEXT: fldl (%esp) -; AVX-NEXT: fisttpll (%esp) -; AVX-NEXT: setbe %al -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: shll $31, %eax -; AVX-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: 
vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 -; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-NEXT: vaddsd %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX-NEXT: vcmpeqsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: andl $1, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: movl %ebp, %esp -; AVX-NEXT: popl %ebp ; AVX-NEXT: retl ; ; AVX512-LABEL: oeq_f64_u64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm1 -; AVX512-NEXT: vcvtuqq2pd %ymm1, %ymm1 +; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vcmpeqsd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retl entry: %0 = fptoui double %x to i64 @@ -282,8 +229,7 @@ define zeroext i8 @une_f64_i32(double %x) nounwind readnone { ; AVX-LABEL: une_f64_i32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vcvttpd2dq %xmm0, %xmm1 -; AVX-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: andl $1, %eax @@ -293,8 +239,7 @@ define zeroext i8 @une_f64_i32(double %x) nounwind readnone { ; AVX512-LABEL: une_f64_i32: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vcvttpd2dq %xmm0, %xmm1 -; AVX512-NEXT: vcvtdq2pd %xmm1, %xmm1 +; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax @@ -331,16 +276,7 @@ define zeroext i8 @une_f64_u32(double %x) nounwind readnone { ; AVX-LABEL: une_f64_u32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vcvttsd2si %xmm0, %eax -; AVX-NEXT: movl %eax, %ecx -; AVX-NEXT: sarl $31, %ecx -; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 -; AVX-NEXT: vcvttsd2si %xmm1, %edx -; AVX-NEXT: andl %ecx, %edx -; AVX-NEXT: orl %eax, %edx -; AVX-NEXT: vmovd %edx, %xmm1 -; AVX-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 -; AVX-NEXT: vsubsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 +; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: andl $1, %eax @@ -350,8 +286,7 @@ define zeroext i8 @une_f64_u32(double %x) nounwind readnone { ; AVX512-LABEL: une_f64_u32: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vcvttsd2usi %xmm0, %eax -; AVX512-NEXT: vcvtusi2sd %eax, %xmm7, %xmm1 +; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax @@ -395,35 +330,21 @@ define zeroext i8 @une_f64_i64(double %x) nounwind readnone { ; ; AVX-LABEL: une_f64_i64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: pushl %ebp -; AVX-NEXT: movl %esp, %ebp -; AVX-NEXT: andl $-8, %esp -; AVX-NEXT: subl $24, %esp ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd %xmm0, (%esp) -; AVX-NEXT: fldl (%esp) -; AVX-NEXT: fisttpll (%esp) -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovlps %xmm1, {{[0-9]+}}(%esp) -; AVX-NEXT: fildll {{[0-9]+}}(%esp) -; AVX-NEXT: fstpl {{[0-9]+}}(%esp) -; AVX-NEXT: vcmpneqsd {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-NEXT: vroundsd 
$11, %xmm0, %xmm0, %xmm1 +; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: andl $1, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: movl %ebp, %esp -; AVX-NEXT: popl %ebp ; AVX-NEXT: retl ; ; AVX512-LABEL: une_f64_i64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vcvttpd2qq %xmm0, %xmm1 -; AVX512-NEXT: vcvtqq2pd %ymm1, %ymm1 +; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retl entry: %0 = fptosi double %x to i64 @@ -480,48 +401,21 @@ define zeroext i8 @une_f64_u64(double %x) nounwind readnone { ; ; AVX-LABEL: une_f64_u64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: pushl %ebp -; AVX-NEXT: movl %esp, %ebp -; AVX-NEXT: andl $-8, %esp -; AVX-NEXT: subl $8, %esp ; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] -; AVX-NEXT: vucomisd %xmm0, %xmm1 -; AVX-NEXT: jbe .LBB7_2 -; AVX-NEXT: # %bb.1: # %entry -; AVX-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: .LBB7_2: # %entry -; AVX-NEXT: vsubsd %xmm1, %xmm0, %xmm1 -; AVX-NEXT: vmovsd %xmm1, (%esp) -; AVX-NEXT: fldl (%esp) -; AVX-NEXT: fisttpll (%esp) -; AVX-NEXT: setbe %al -; AVX-NEXT: movzbl %al, %eax -; AVX-NEXT: shll $31, %eax -; AVX-NEXT: xorl {{[0-9]+}}(%esp), %eax -; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] -; AVX-NEXT: vsubpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1, %xmm1 -; AVX-NEXT: vshufpd {{.*#+}} xmm2 = xmm1[1,0] -; AVX-NEXT: vaddsd %xmm1, %xmm2, %xmm1 +; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX-NEXT: vcmpneqsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: andl $1, %eax ; AVX-NEXT: # kill: def $al killed $al killed $eax -; AVX-NEXT: movl %ebp, %esp -; AVX-NEXT: popl %ebp ; AVX-NEXT: retl ; ; AVX512-LABEL: une_f64_u64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX512-NEXT: vcvttpd2uqq %xmm0, %xmm1 -; AVX512-NEXT: vcvtuqq2pd %ymm1, %ymm1 +; AVX512-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm1 ; AVX512-NEXT: vcmpneqsd %xmm0, %xmm1, %k0 ; AVX512-NEXT: kmovd %k0, %eax ; AVX512-NEXT: # kill: def $al killed $al killed $eax -; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retl entry: %0 = fptoui double %x to i64
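
A closing note for reviewers: the rewritten setoeq.ll checks above replace an fptosi/sitofp (or fptoui/uitofp) round trip feeding a comparison with a single vroundsd. The standalone C++ sketch below is illustrative only, not part of the patch; the local names x, via_int and via_trunc are mine. Leaving aside inputs that overflow the integer type, the round trip and a plain truncation agree on every value except the sign of a zero result (inputs in (-1.0, -0.0]), and an IEEE-754 equality test cannot observe that difference, which is the property the fold relies on.

// Illustration only -- not part of the patch.
#include <cmath>
#include <cstdio>

int main() {
  double x = -0.25;

  double via_int = static_cast<double>(static_cast<int>(x)); // +0.0
  double via_trunc = std::trunc(x);                          // -0.0

  std::printf("signbit(via_int)     = %d\n", (int)std::signbit(via_int));   // 0
  std::printf("signbit(via_trunc)   = %d\n", (int)std::signbit(via_trunc)); // 1
  std::printf("via_int == via_trunc = %d\n", (int)(via_int == via_trunc));  // 1
  return 0;
}

Only a user that inspects the sign bit directly (for example copysign reading its second operand, a bitcast, or a division) can tell the two results apart, which is why such users are not on the allow list in allUsesSignedZeroInsensitive.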