diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a6ba6e518899f..c81568672de3c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -17770,7 +17770,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
-  // N0 + -0.0 --> N0 (also allowed with +0.0 and fast-math)
+  // N0 + -0.0 --> N0 (also allowed with +0.0 and nsz)
   ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1, true);
   if (N1C && N1C->isZero())
-    if (N1C->isNegative() || Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())
+    if (N1C->isNegative() || Flags.hasNoSignedZeros())
       return N0;
 
   if (SDValue NewSel = foldBinOpIntoSelect(N))
@@ -17823,11 +17823,10 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
     return DAG.getConstantFP(0.0, DL, VT);
   }
 
-  // If 'unsafe math' or reassoc and nsz, fold lots of things.
-  // TODO: break out portions of the transformations below for which Unsafe is
-  // considered and which do not require both nsz and reassoc
-  if ((Options.NoSignedZerosFPMath ||
-       (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros())) &&
+  // If reassoc and nsz, fold lots of things.
+  // TODO: break out portions of the transformations below which do not
+  // require both nsz and reassoc
+  if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros() &&
       AllowNewConst) {
     // fadd (fadd x, c1), c2 -> fadd x, c1 + c2
     if (N1CFP && N0.getOpcode() == ISD::FADD &&
@@ -17911,10 +17910,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
                            DAG.getConstantFP(4.0, DL, VT));
       }
     }
-  } // enable-unsafe-fp-math && AllowNewConst
+  } // reassoc && nsz && AllowNewConst
 
-  if ((Options.NoSignedZerosFPMath ||
-       (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros()))) {
+  if (Flags.hasAllowReassociation() && Flags.hasNoSignedZeros()) {
     // Fold fadd(vecreduce(x), vecreduce(y)) -> vecreduce(fadd(x, y))
     if (SDValue SD = reassociateReduction(ISD::VECREDUCE_FADD, ISD::FADD, DL,
                                           VT, N0, N1, Flags))
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 462d7748b86cd..b14e8c44ffcce 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -581,145 +581,63 @@ define { half, half } @v_fneg_add_multi_use_fneg_x_f16(half %a, half %b, half %c
   ret { half, half } %insert.1
 }
 
-; This one asserted with -enable-no-signed-zeros-fp-math
-define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 {
-; SI-SAFE-LABEL: fneg_fadd_0_f16:
-; SI-SAFE: ; %bb.0: ; %.entry
-; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s1
-; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v1, s0
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-SAFE-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0
-; SI-SAFE-NEXT: v_rcp_f32_e32 v3, v2
-; SI-SAFE-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0
-; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3
-; SI-SAFE-NEXT: v_fma_f32 v5, -v2, v3, 1.0
-; SI-SAFE-NEXT: v_fma_f32 v3, v5, v3, v3
-; SI-SAFE-NEXT: v_mul_f32_e32 v5, v4, v3
-; SI-SAFE-NEXT: v_fma_f32 v6, -v2, v5, v4
-; SI-SAFE-NEXT: v_fma_f32 v5, v6, v3, v5
-; SI-SAFE-NEXT: v_fma_f32 v2, -v2, v5, v4
-; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
-; SI-SAFE-NEXT: v_div_fmas_f32 v2, v2, v3, v5
-; SI-SAFE-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0
-; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0
-; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc
-; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000
-; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0
-; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, 
v1, 0, vcc -; SI-SAFE-NEXT: ; return to shader part epilog -; -; SI-NSZ-LABEL: fneg_fadd_0_f16: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 -; SI-NSZ-NEXT: v_rcp_f32_e32 v3, v2 -; SI-NSZ-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NSZ-NEXT: v_fma_f32 v5, -v2, v3, 1.0 -; SI-NSZ-NEXT: v_fma_f32 v3, v5, v3, v3 -; SI-NSZ-NEXT: v_mul_f32_e32 v5, v4, v3 -; SI-NSZ-NEXT: v_fma_f32 v6, -v2, v5, v4 -; SI-NSZ-NEXT: v_fma_f32 v5, v6, v3, v5 -; SI-NSZ-NEXT: v_fma_f32 v2, -v2, v5, v4 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NSZ-NEXT: v_div_fmas_f32 v2, v2, v3, v5 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 -; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-NSZ-NEXT: ; return to shader part epilog -; -; VI-SAFE-LABEL: fneg_fadd_0_f16: -; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_rcp_f16_e32 v0, s1 -; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; VI-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0 -; VI-SAFE-NEXT: v_add_f16_e32 v0, 0, v0 -; VI-SAFE-NEXT: v_xor_b32_e32 v2, 0x8000, v0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00 -; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-SAFE-NEXT: ; return to shader part epilog -; -; VI-NSZ-LABEL: fneg_fadd_0_f16: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1 -; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 -; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00 -; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-NSZ-NEXT: ; return to shader part epilog -; -; GFX11-SAFE-LABEL: fneg_fadd_0_f16: -; GFX11-SAFE: ; %bb.0: ; %.entry -; GFX11-SAFE-NEXT: v_rcp_f16_e32 v0, s1 -; GFX11-SAFE-NEXT: s_waitcnt_depctr 0xfff -; GFX11-SAFE-NEXT: v_mul_f16_e32 v0, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_add_f16_e32 v0, 0, v0 -; GFX11-SAFE-NEXT: v_xor_b32_e32 v1, 0x8000, v0 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo -; GFX11-SAFE-NEXT: ; return to shader part epilog -; -; GFX11-NSZ-LABEL: fneg_fadd_0_f16: -; GFX11-NSZ: ; %bb.0: ; %.entry -; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1 -; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: 
v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo -; GFX11-NSZ-NEXT: ; return to shader part epilog -; GFX11-SAFE-TRUE16-LABEL: fneg_fadd_0_f16: -; GFX11-SAFE-TRUE16: ; %bb.0: ; %.entry -; GFX11-SAFE-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1 -; GFX11-SAFE-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-SAFE-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0, v0.l -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-TRUE16-NEXT: v_add_f16_e32 v0.l, 0, v0.l -; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0.l -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-TRUE16-NEXT: v_xor_b32_e32 v0, 0x8000, v1 -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0/*Invalid register, operand has 'VS_16' register class*/, s0, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog -; GFX11-NSZ-TRUE16-LABEL: fneg_fadd_0_f16: -; GFX11-NSZ-TRUE16: ; %bb.0: ; %.entry -; GFX11-NSZ-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1 -; GFX11-NSZ-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x8000, v0.l -; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e64 s1, -v0.l, s0 -; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1 -; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l -; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo -; GFX11-NSZ-TRUE16-NEXT: ; return to shader part epilog +define amdgpu_ps half @fneg_fadd_0_safe_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #0 { +; SI-LABEL: fneg_fadd_0_safe_f16: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 2, 2), 0 +; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, 1.0 +; SI-NEXT: v_rcp_f32_e32 v3, v2 +; SI-NEXT: v_div_scale_f32 v4, vcc, 1.0, v0, 1.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v5, -v2, v3, 1.0 +; SI-NEXT: v_fma_f32 v3, v5, v3, v3 +; SI-NEXT: v_mul_f32_e32 v5, v4, v3 +; SI-NEXT: v_fma_f32 v6, -v2, v5, v4 +; SI-NEXT: v_fma_f32 v5, v6, v3, v5 +; SI-NEXT: v_fma_f32 v2, -v2, v5, v4 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v2, v2, v3, v5 +; SI-NEXT: v_div_fixup_f32 v0, v2, v0, 1.0 +; SI-NEXT: v_mad_f32 v0, v0, 0, 0 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, v0, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: fneg_fadd_0_safe_f16: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: v_rcp_f16_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mul_f16_e32 v0, 0, v0 +; VI-NEXT: v_add_f16_e32 v0, 0, v0 +; VI-NEXT: v_xor_b32_e32 v2, 0x8000, v0 +; VI-NEXT: v_cmp_ngt_f16_e32 vcc, s0, v0 +; VI-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x7e00 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; VI-NEXT: ; 
return to shader part epilog +; +; GFX11-LABEL: fneg_fadd_0_safe_f16: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: v_rcp_f16_e32 v0, s1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v0, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_f16_e32 v0, 0, v0 +; GFX11-NEXT: v_xor_b32_e32 v1, 0x8000, v0 +; GFX11-NEXT: v_cmp_ngt_f16_e32 vcc_lo, s0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, v1, s0, vcc_lo +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo +; GFX11-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv half 1.000000e+00, %tmp6 %tmp8 = fmul half 0.000000e+00, %tmp7 @@ -733,108 +651,51 @@ define amdgpu_ps half @fneg_fadd_0_f16(half inreg %tmp2, half inreg %tmp6, <4 x ret half %.i198 } -; This is a workaround because -enable-no-signed-zeros-fp-math does not set up -; function attribute unsafe-fp-math automatically. Combine with the previous test -; when that is done. define amdgpu_ps half @fneg_fadd_0_nsz_f16(half inreg %tmp2, half inreg %tmp6, <4 x i32> %arg) #2 { -; SI-SAFE-LABEL: fneg_fadd_0_nsz_f16: -; SI-SAFE: ; %bb.0: ; %.entry -; SI-SAFE-NEXT: v_cvt_f16_f32_e32 v0, s0 -; SI-SAFE-NEXT: s_brev_b32 s0, 1 -; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-SAFE-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-SAFE-NEXT: v_min_legacy_f32_e32 v0, 0, v0 -; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-SAFE-NEXT: ; return to shader part epilog -; -; SI-NSZ-LABEL: fneg_fadd_0_nsz_f16: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v0, s1 -; SI-NSZ-NEXT: v_cvt_f16_f32_e32 v1, s0 -; SI-NSZ-NEXT: v_mov_b32_e32 v2, 0x7fc00000 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v0, v0 -; SI-NSZ-NEXT: v_cvt_f32_f16_e32 v1, v1 -; SI-NSZ-NEXT: v_rcp_f32_e32 v0, v0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 -; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc -; SI-NSZ-NEXT: ; return to shader part epilog -; -; VI-SAFE-LABEL: fneg_fadd_0_nsz_f16: -; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_mov_b32_e32 v0, 0x8000 -; VI-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; VI-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc, s0, 0 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7e00 -; VI-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-SAFE-NEXT: ; return to shader part epilog -; -; VI-NSZ-LABEL: fneg_fadd_0_nsz_f16: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_rcp_f16_e32 v0, s1 -; VI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; VI-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 -; VI-NSZ-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0 -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc -; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7e00 -; VI-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-NSZ-NEXT: ; return to shader part epilog -; -; GFX11-SAFE-LABEL: fneg_fadd_0_nsz_f16: -; GFX11-SAFE: ; %bb.0: ; %.entry -; GFX11-SAFE-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-SAFE-NEXT: v_cmp_ngt_f16_e64 vcc_lo, s0, 0 -; GFX11-SAFE-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-NEXT: v_cndmask_b32_e32 v0, 0x8000, v0, vcc_lo -; GFX11-SAFE-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-SAFE-NEXT: s_delay_alu 
instid0(VALU_DEP_3) -; GFX11-SAFE-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo -; GFX11-SAFE-NEXT: ; return to shader part epilog -; -; GFX11-NSZ-LABEL: fneg_fadd_0_nsz_f16: -; GFX11-NSZ: ; %bb.0: ; %.entry -; GFX11-NSZ-NEXT: v_rcp_f16_e32 v0, s1 -; GFX11-NSZ-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NSZ-NEXT: v_mul_f16_e32 v0, 0x8000, v0 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 -; GFX11-NSZ-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 -; GFX11-NSZ-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo -; GFX11-NSZ-NEXT: ; return to shader part epilog -; GFX11-SAFE-TRUE16-LABEL: fneg_fadd_0_nsz_f16: -; GFX11-SAFE-TRUE16: ; %bb.0: ; %.entry -; GFX11-SAFE-TRUE16-NEXT: v_cmp_ngt_f16_e64 s1, s0, 0 -; GFX11-SAFE-TRUE16-NEXT: v_mov_b16_e32 v0.l, 0x8000 -; GFX11-SAFE-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1 -; GFX11-SAFE-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l -; GFX11-SAFE-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo -; GFX11-SAFE-TRUE16-NEXT: ; return to shader part epilog -; GFX11-NSZ-TRUE16-LABEL: fneg_fadd_0_nsz_f16: -; GFX11-NSZ-TRUE16: ; %bb.0: ; %.entry -; GFX11-NSZ-TRUE16-NEXT: v_rcp_f16_e32 v0.l, s1 -; GFX11-NSZ-TRUE16-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NSZ-TRUE16-NEXT: v_mul_f16_e32 v0.l, 0x8000, v0.l -; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e64 s1, -v0.l, s0 -; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, v0.l, s0, s1 -; GFX11-NSZ-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NSZ-TRUE16-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0.l -; GFX11-NSZ-TRUE16-NEXT: v_cndmask_b16 v0.l, 0x7e00, 0, vcc_lo -; GFX11-NSZ-TRUE16-NEXT: ; return to shader part epilog +; SI-LABEL: fneg_fadd_0_nsz_f16: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_cvt_f16_f32_e32 v0, s1 +; SI-NEXT: v_cvt_f16_f32_e32 v1, s0 +; SI-NEXT: v_mov_b32_e32 v2, 0x7fc00000 +; SI-NEXT: v_cvt_f32_f16_e32 v0, v0 +; SI-NEXT: v_cvt_f32_f16_e32 v1, v1 +; SI-NEXT: v_rcp_f32_e32 v0, v0 +; SI-NEXT: v_mul_f32_e32 v0, 0x80000000, v0 +; SI-NEXT: v_cmp_nlt_f32_e64 vcc, -v0, v1 +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v2, 0, vcc +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: fneg_fadd_0_nsz_f16: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: v_rcp_f16_e32 v0, s1 +; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mul_f16_e32 v0, 0x8000, v0 +; VI-NEXT: v_cmp_nlt_f16_e64 vcc, -v0, s0 +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; VI-NEXT: v_mov_b32_e32 v1, 0x7e00 +; VI-NEXT: v_cmp_nlt_f16_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; VI-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: fneg_fadd_0_nsz_f16: +; GFX11: ; %bb.0: ; %.entry +; GFX11-NEXT: v_rcp_f16_e32 v0, s1 +; GFX11-NEXT: s_waitcnt_depctr 0xfff +; GFX11-NEXT: v_mul_f16_e32 v0, 0x8000, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_nlt_f16_e64 s1, -v0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_nlt_f16_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0x7e00, 0, vcc_lo +; GFX11-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv afn half 1.000000e+00, 
%tmp6 %tmp8 = fmul contract half 0.000000e+00, %tmp7 %tmp9 = fmul reassoc nnan arcp contract half 0.000000e+00, %tmp8 - %.i188 = fadd nnan ninf contract half %tmp9, 0.000000e+00 + %.i188 = fadd nsz half %tmp9, 0.000000e+00 %tmp10 = fcmp uge half %.i188, %tmp2 %tmp11 = fneg half %.i188 %.i092 = select i1 %tmp10, half %tmp2, half %tmp11 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll index ba34e9245f39c..12e9888314fc1 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.ll @@ -880,102 +880,54 @@ define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(ptr addrspace(1) %out } ; This one asserted with -enable-no-signed-zeros-fp-math -define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { -; SI-SAFE-LABEL: fneg_fadd_0: -; SI-SAFE: ; %bb.0: ; %.entry -; SI-SAFE-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; SI-SAFE-NEXT: v_rcp_f32_e32 v1, v0 -; SI-SAFE-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 -; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-SAFE-NEXT: v_fma_f32 v3, -v0, v1, 1.0 -; SI-SAFE-NEXT: v_fma_f32 v1, v3, v1, v1 -; SI-SAFE-NEXT: v_mul_f32_e32 v3, v2, v1 -; SI-SAFE-NEXT: v_fma_f32 v4, -v0, v3, v2 -; SI-SAFE-NEXT: v_fma_f32 v3, v4, v1, v3 -; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v3, v2 -; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-SAFE-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-SAFE-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0 -; SI-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-SAFE-NEXT: ; return to shader part epilog -; -; SI-NSZ-LABEL: fneg_fadd_0: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; SI-NSZ-NEXT: v_rcp_f32_e32 v1, v0 -; SI-NSZ-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NSZ-NEXT: v_fma_f32 v3, -v0, v1, 1.0 -; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, v1 -; SI-NSZ-NEXT: v_mul_f32_e32 v3, v2, v1 -; SI-NSZ-NEXT: v_fma_f32 v4, -v0, v3, v2 -; SI-NSZ-NEXT: v_fma_f32 v3, v4, v1, v3 -; SI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v2 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; SI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-NSZ-NEXT: ; return to shader part epilog -; -; VI-SAFE-LABEL: fneg_fadd_0: -; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; VI-SAFE-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 -; VI-SAFE-NEXT: v_rcp_f32_e32 v2, v0 -; VI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-SAFE-NEXT: v_fma_f32 v3, -v0, v2, 1.0 -; VI-SAFE-NEXT: v_fma_f32 v2, v3, v2, v2 -; VI-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2 -; VI-SAFE-NEXT: v_fma_f32 v4, -v0, v3, v1 -; VI-SAFE-NEXT: v_fma_f32 v3, v4, v2, v3 -; VI-SAFE-NEXT: v_fma_f32 v0, -v0, v3, v1 -; VI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; 
VI-SAFE-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; VI-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-SAFE-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0 -; VI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; VI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-SAFE-NEXT: ; return to shader part epilog -; -; VI-NSZ-LABEL: fneg_fadd_0: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; VI-NSZ-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 -; VI-NSZ-NEXT: v_rcp_f32_e32 v2, v0 -; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NSZ-NEXT: v_fma_f32 v3, -v0, v2, 1.0 -; VI-NSZ-NEXT: v_fma_f32 v2, v3, v2, v2 -; VI-NSZ-NEXT: v_mul_f32_e32 v3, v1, v2 -; VI-NSZ-NEXT: v_fma_f32 v4, -v0, v3, v1 -; VI-NSZ-NEXT: v_fma_f32 v3, v4, v2, v3 -; VI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v1 -; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 -; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; VI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-NSZ-NEXT: ; return to shader part epilog +define amdgpu_ps float @fneg_fadd_0_safe(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #0 { +; SI-LABEL: fneg_fadd_0_safe: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; SI-NEXT: v_rcp_f32_e32 v1, v0 +; SI-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v3, -v0, v1, 1.0 +; SI-NEXT: v_fma_f32 v1, v3, v1, v1 +; SI-NEXT: v_mul_f32_e32 v3, v2, v1 +; SI-NEXT: v_fma_f32 v4, -v0, v3, v2 +; SI-NEXT: v_fma_f32 v3, v4, v1, v3 +; SI-NEXT: v_fma_f32 v0, -v0, v3, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v0, v0, v1, v3 +; SI-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 +; SI-NEXT: v_mad_f32 v0, v0, 0, 0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: fneg_fadd_0_safe: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v3, -v0, v2, 1.0 +; VI-NEXT: v_fma_f32 v2, v3, v2, v2 +; VI-NEXT: v_mul_f32_e32 v3, v1, v2 +; VI-NEXT: v_fma_f32 v4, -v0, v3, v1 +; VI-NEXT: v_fma_f32 v3, v4, v2, v3 +; VI-NEXT: v_fma_f32 v0, -v0, v3, v1 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v0, v0, v2, v3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 +; VI-NEXT: v_mad_f32 v0, v0, 0, 0 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; VI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; VI-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv float 
1.000000e+00, %tmp6 %tmp8 = fmul float 0.000000e+00, %tmp7 @@ -989,39 +941,23 @@ define amdgpu_ps float @fneg_fadd_0(float inreg %tmp2, float inreg %tmp6, <4 x i ret float %.i198 } -; This is a workaround because -enable-no-signed-zeros-fp-math does not set up -; function attribute unsafe-fp-math automatically. Combine with the previous test -; when that is done. -define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr #2 { -; GCN-SAFE-LABEL: fneg_fadd_0_nsz: -; GCN-SAFE: ; %bb.0: ; %.entry -; GCN-SAFE-NEXT: v_rcp_f32_e32 v0, s1 -; GCN-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; GCN-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0 -; GCN-SAFE-NEXT: v_add_f32_e32 v0, 0, v0 -; GCN-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; GCN-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; GCN-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GCN-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; GCN-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GCN-SAFE-NEXT: ; return to shader part epilog -; -; GCN-NSZ-LABEL: fneg_fadd_0_nsz: -; GCN-NSZ: ; %bb.0: ; %.entry -; GCN-NSZ-NEXT: v_rcp_f32_e32 v0, s1 -; GCN-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; GCN-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; GCN-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GCN-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GCN-NSZ-NEXT: ; return to shader part epilog +define amdgpu_ps float @fneg_fadd_0_nsz(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) local_unnamed_addr { +; GCN-LABEL: fneg_fadd_0_nsz: +; GCN: ; %bb.0: ; %.entry +; GCN-NEXT: v_rcp_f32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mul_f32_e32 v0, 0, v0 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; GCN-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv afn float 1.000000e+00, %tmp6 %tmp8 = fmul float 0.000000e+00, %tmp7 %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8 - %.i188 = fadd float %tmp9, 0.000000e+00 + %.i188 = fadd nsz float %tmp9, 0.000000e+00 %tmp10 = fcmp uge float %.i188, %tmp2 %tmp11 = fneg float %.i188 %.i092 = select i1 %tmp10, float %tmp2, float %tmp11 @@ -8072,3 +8008,6 @@ attributes #0 = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" attributes #1 = { nounwind readnone } attributes #2 = { nounwind "unsafe-fp-math"="true" } attributes #3 = { nounwind "no-signed-zeros-fp-math"="true" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN-NSZ: {{.*}} +; GCN-SAFE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll index e687745469014..c4ca79dc85312 100644 --- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll @@ -175,103 +175,54 @@ define { float, float } @v_fneg_add_multi_use_fneg_x_f32(float %a, float %b, flo ret { float, float } %insert.1 } -; This one asserted with -enable-no-signed-zeros-fp-math -define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) #0 { -; SI-SAFE-LABEL: fneg_fadd_0_f32: -; SI-SAFE: ; %bb.0: ; %.entry -; SI-SAFE-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; SI-SAFE-NEXT: v_rcp_f32_e32 v1, v0 -; SI-SAFE-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 -; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-SAFE-NEXT: v_fma_f32 v3, -v0, v1, 1.0 -; SI-SAFE-NEXT: v_fma_f32 v1, v3, v1, v1 -; SI-SAFE-NEXT: v_mul_f32_e32 v3, v2, v1 -; SI-SAFE-NEXT: v_fma_f32 v4, -v0, v3, v2 -; SI-SAFE-NEXT: v_fma_f32 v3, v4, v1, v3 -; SI-SAFE-NEXT: v_fma_f32 v0, -v0, v3, v2 -; SI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-SAFE-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-SAFE-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0 -; SI-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; SI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-SAFE-NEXT: ; return to shader part epilog -; -; SI-NSZ-LABEL: fneg_fadd_0_f32: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; SI-NSZ-NEXT: v_rcp_f32_e32 v1, v0 -; SI-NSZ-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; SI-NSZ-NEXT: v_fma_f32 v3, -v0, v1, 1.0 -; SI-NSZ-NEXT: v_fma_f32 v1, v3, v1, v1 -; SI-NSZ-NEXT: v_mul_f32_e32 v3, v2, v1 -; SI-NSZ-NEXT: v_fma_f32 v4, -v0, v3, v2 -; SI-NSZ-NEXT: v_fma_f32 v3, v4, v1, v3 -; SI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v2 -; SI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; SI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v1, v3 -; SI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; SI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; SI-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; SI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; SI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; SI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; SI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; SI-NSZ-NEXT: ; return to shader part epilog -; -; VI-SAFE-LABEL: fneg_fadd_0_f32: -; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; VI-SAFE-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 -; VI-SAFE-NEXT: v_rcp_f32_e32 v2, v0 -; VI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-SAFE-NEXT: v_fma_f32 v3, -v0, v2, 1.0 -; VI-SAFE-NEXT: v_fma_f32 v2, v3, v2, v2 -; VI-SAFE-NEXT: v_mul_f32_e32 v3, v1, v2 -; VI-SAFE-NEXT: v_fma_f32 v4, -v0, v3, v1 -; VI-SAFE-NEXT: v_fma_f32 v3, v4, v2, v3 -; VI-SAFE-NEXT: v_fma_f32 v0, -v0, v3, v1 -; VI-SAFE-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-SAFE-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; VI-SAFE-NEXT: v_mov_b32_e32 v2, s0 -; VI-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-SAFE-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-SAFE-NEXT: v_mad_f32 v0, v0, 0, 0 -; 
VI-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; VI-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; VI-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-SAFE-NEXT: ; return to shader part epilog +define amdgpu_ps float @fneg_fadd_0_safe_f32(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) #0 { +; SI-LABEL: fneg_fadd_0_safe_f32: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; SI-NEXT: v_rcp_f32_e32 v1, v0 +; SI-NEXT: v_div_scale_f32 v2, vcc, 1.0, s1, 1.0 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; SI-NEXT: v_fma_f32 v3, -v0, v1, 1.0 +; SI-NEXT: v_fma_f32 v1, v3, v1, v1 +; SI-NEXT: v_mul_f32_e32 v3, v2, v1 +; SI-NEXT: v_fma_f32 v4, -v0, v3, v2 +; SI-NEXT: v_fma_f32 v3, v4, v1, v3 +; SI-NEXT: v_fma_f32 v0, -v0, v3, v2 +; SI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; SI-NEXT: v_div_fmas_f32 v0, v0, v1, v3 +; SI-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 +; SI-NEXT: v_mad_f32 v0, v0, 0, 0 +; SI-NEXT: v_mov_b32_e32 v1, s0 +; SI-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; SI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; SI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; SI-NEXT: ; return to shader part epilog ; -; VI-NSZ-LABEL: fneg_fadd_0_f32: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 -; VI-NSZ-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 -; VI-NSZ-NEXT: v_rcp_f32_e32 v2, v0 -; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 -; VI-NSZ-NEXT: v_fma_f32 v3, -v0, v2, 1.0 -; VI-NSZ-NEXT: v_fma_f32 v2, v3, v2, v2 -; VI-NSZ-NEXT: v_mul_f32_e32 v3, v1, v2 -; VI-NSZ-NEXT: v_fma_f32 v4, -v0, v3, v1 -; VI-NSZ-NEXT: v_fma_f32 v3, v4, v2, v3 -; VI-NSZ-NEXT: v_fma_f32 v0, -v0, v3, v1 -; VI-NSZ-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 -; VI-NSZ-NEXT: v_div_fmas_f32 v0, v0, v2, v3 -; VI-NSZ-NEXT: v_mov_b32_e32 v2, s0 -; VI-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; VI-NSZ-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 -; VI-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; VI-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc -; VI-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; VI-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; VI-NSZ-NEXT: ; return to shader part epilog +; VI-LABEL: fneg_fadd_0_safe_f32: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: v_div_scale_f32 v0, s[2:3], s1, s1, 1.0 +; VI-NEXT: v_div_scale_f32 v1, vcc, 1.0, s1, 1.0 +; VI-NEXT: v_rcp_f32_e32 v2, v0 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 3 +; VI-NEXT: v_fma_f32 v3, -v0, v2, 1.0 +; VI-NEXT: v_fma_f32 v2, v3, v2, v2 +; VI-NEXT: v_mul_f32_e32 v3, v1, v2 +; VI-NEXT: v_fma_f32 v4, -v0, v3, v1 +; VI-NEXT: v_fma_f32 v3, v4, v2, v3 +; VI-NEXT: v_fma_f32 v0, -v0, v3, v1 +; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 +; VI-NEXT: v_div_fmas_f32 v0, v0, v2, v3 +; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; VI-NEXT: v_div_fixup_f32 v0, v0, s1, 1.0 +; VI-NEXT: v_mad_f32 v0, v0, 0, 0 +; VI-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, -v0, v2, vcc +; VI-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; VI-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv float 1.000000e+00, %tmp6 %tmp8 = fmul float 0.000000e+00, %tmp7 @@ -289,35 +240,22 @@ define amdgpu_ps float @fneg_fadd_0_f32(float inreg %tmp2, float inreg %tmp6, <4 ; function attribute unsafe-fp-math automatically. 
Combine with the previous test ; when that is done. define amdgpu_ps float @fneg_fadd_0_nsz_f32(float inreg %tmp2, float inreg %tmp6, <4 x i32> %arg) #2 { -; GCN-SAFE-LABEL: fneg_fadd_0_nsz_f32: -; GCN-SAFE: ; %bb.0: ; %.entry -; GCN-SAFE-NEXT: v_rcp_f32_e32 v0, s1 -; GCN-SAFE-NEXT: v_mov_b32_e32 v1, s0 -; GCN-SAFE-NEXT: v_mul_f32_e32 v0, 0, v0 -; GCN-SAFE-NEXT: v_add_f32_e32 v0, 0, v0 -; GCN-SAFE-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; GCN-SAFE-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; GCN-SAFE-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GCN-SAFE-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; GCN-SAFE-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GCN-SAFE-NEXT: ; return to shader part epilog -; -; GCN-NSZ-LABEL: fneg_fadd_0_nsz_f32: -; GCN-NSZ: ; %bb.0: ; %.entry -; GCN-NSZ-NEXT: v_rcp_f32_e32 v0, s1 -; GCN-NSZ-NEXT: v_mov_b32_e32 v1, s0 -; GCN-NSZ-NEXT: v_mul_f32_e32 v0, 0, v0 -; GCN-NSZ-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 -; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc -; GCN-NSZ-NEXT: v_mov_b32_e32 v1, 0x7fc00000 -; GCN-NSZ-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 -; GCN-NSZ-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc -; GCN-NSZ-NEXT: ; return to shader part epilog +; GCN-LABEL: fneg_fadd_0_nsz_f32: +; GCN: ; %bb.0: ; %.entry +; GCN-NEXT: v_rcp_f32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s0 +; GCN-NEXT: v_mul_f32_e32 v0, 0, v0 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, -v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, 0, vcc +; GCN-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv afn float 1.000000e+00, %tmp6 %tmp8 = fmul float 0.000000e+00, %tmp7 %tmp9 = fmul reassoc nnan arcp contract float 0.000000e+00, %tmp8 - %.i188 = fadd float %tmp9, 0.000000e+00 + %.i188 = fadd nsz float %tmp9, 0.000000e+00 %tmp10 = fcmp uge float %.i188, %tmp2 %tmp11 = fneg float %.i188 %.i092 = select i1 %tmp10, float %tmp2, float %tmp11 @@ -569,8 +507,6 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; SI-NSZ-LABEL: fneg_fadd_0_f64: ; SI-NSZ: ; %bb.0: ; %.entry ; SI-NSZ-NEXT: v_div_scale_f64 v[0:1], s[4:5], s[2:3], s[2:3], 1.0 -; SI-NSZ-NEXT: s_mov_b32 s4, 0 -; SI-NSZ-NEXT: s_brev_b32 s5, 1 ; SI-NSZ-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] ; SI-NSZ-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 ; SI-NSZ-NEXT: v_fma_f64 v[2:3], v[2:3], v[4:5], v[2:3] @@ -583,7 +519,10 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; SI-NSZ-NEXT: v_mov_b32_e32 v2, s1 ; SI-NSZ-NEXT: v_mov_b32_e32 v3, s0 ; SI-NSZ-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0 -; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[4:5] +; SI-NSZ-NEXT: s_mov_b32 s2, 0 +; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], 0 +; SI-NSZ-NEXT: s_brev_b32 s3, 1 +; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[0:1], s[2:3], s[2:3] ; SI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] ; SI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -637,7 +576,8 @@ define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; VI-NSZ-NEXT: v_div_fixup_f64 v[0:1], v[0:1], s[2:3], 1.0 ; VI-NSZ-NEXT: s_mov_b32 s2, 0 ; VI-NSZ-NEXT: s_brev_b32 s3, 1 -; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] +; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], 0 +; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[0:1], s[2:3], s[2:3] ; VI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] ; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc @@ -663,102 +603,56 @@ 
define amdgpu_ps double @fneg_fadd_0_f64(double inreg %tmp2, double inreg %tmp6, ; function attribute unsafe-fp-math automatically. Combine with the previous test ; when that is done. define amdgpu_ps double @fneg_fadd_0_nsz_f64(double inreg %tmp2, double inreg %tmp6, <4 x i32> %arg) #2 { -; SI-SAFE-LABEL: fneg_fadd_0_nsz_f64: -; SI-SAFE: ; %bb.0: ; %.entry -; SI-SAFE-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] -; SI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-SAFE-NEXT: v_mov_b32_e32 v2, s1 -; SI-SAFE-NEXT: v_mul_f64 v[0:1], v[0:1], 0 -; SI-SAFE-NEXT: v_mov_b32_e32 v3, s0 -; SI-SAFE-NEXT: v_add_f64 v[0:1], v[0:1], 0 -; SI-SAFE-NEXT: v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1] -; SI-SAFE-NEXT: v_xor_b32_e32 v4, 0x80000000, v1 -; SI-SAFE-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc -; SI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; SI-SAFE-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; SI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-SAFE-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; SI-SAFE-NEXT: s_mov_b32 s0, 0 -; SI-SAFE-NEXT: ; return to shader part epilog -; -; SI-NSZ-LABEL: fneg_fadd_0_nsz_f64: -; SI-NSZ: ; %bb.0: ; %.entry -; SI-NSZ-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] -; SI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; SI-NSZ-NEXT: s_mov_b32 s2, 0 -; SI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; SI-NSZ-NEXT: s_brev_b32 s3, 1 -; SI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] -; SI-NSZ-NEXT: v_mov_b32_e32 v2, s1 -; SI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] -; SI-NSZ-NEXT: v_mov_b32_e32 v3, s0 -; SI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; SI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; SI-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; SI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec -; SI-NSZ-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; SI-NSZ-NEXT: s_mov_b32 s0, 0 -; SI-NSZ-NEXT: ; return to shader part epilog -; -; VI-SAFE-LABEL: fneg_fadd_0_nsz_f64: -; VI-SAFE: ; %bb.0: ; %.entry -; VI-SAFE-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] -; VI-SAFE-NEXT: v_mov_b32_e32 v4, s0 -; VI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-SAFE-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-SAFE-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-SAFE-NEXT: v_mov_b32_e32 v2, s1 -; VI-SAFE-NEXT: v_mul_f64 v[0:1], v[0:1], 0 -; VI-SAFE-NEXT: v_add_f64 v[0:1], v[0:1], 0 -; VI-SAFE-NEXT: v_cmp_ngt_f64_e32 vcc, s[0:1], v[0:1] -; VI-SAFE-NEXT: v_xor_b32_e32 v3, 0x80000000, v1 -; VI-SAFE-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc -; VI-SAFE-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; VI-SAFE-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; VI-SAFE-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-SAFE-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; VI-SAFE-NEXT: s_mov_b32 s0, 0 -; VI-SAFE-NEXT: ; return to shader part epilog +; SI-LABEL: fneg_fadd_0_nsz_f64: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] +; SI-NEXT: v_fma_f64 v[2:3], 
-s[2:3], v[0:1], 1.0 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; SI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; SI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; SI-NEXT: s_brev_b32 s3, 1 +; SI-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] +; SI-NEXT: v_mov_b32_e32 v2, s1 +; SI-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] +; SI-NEXT: v_mov_b32_e32 v3, s0 +; SI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; SI-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; SI-NEXT: s_and_b64 s[0:1], vcc, exec +; SI-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 +; SI-NEXT: s_mov_b32 s0, 0 +; SI-NEXT: ; return to shader part epilog ; -; VI-NSZ-LABEL: fneg_fadd_0_nsz_f64: -; VI-NSZ: ; %bb.0: ; %.entry -; VI-NSZ-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] -; VI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NSZ-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 -; VI-NSZ-NEXT: s_mov_b32 s2, 0 -; VI-NSZ-NEXT: s_brev_b32 s3, 1 -; VI-NSZ-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] -; VI-NSZ-NEXT: v_mov_b32_e32 v2, s1 -; VI-NSZ-NEXT: v_mov_b32_e32 v3, s0 -; VI-NSZ-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] -; VI-NSZ-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] -; VI-NSZ-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc -; VI-NSZ-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc -; VI-NSZ-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] -; VI-NSZ-NEXT: s_and_b64 s[0:1], vcc, exec -; VI-NSZ-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 -; VI-NSZ-NEXT: s_mov_b32 s0, 0 -; VI-NSZ-NEXT: ; return to shader part epilog +; VI-LABEL: fneg_fadd_0_nsz_f64: +; VI: ; %bb.0: ; %.entry +; VI-NEXT: v_rcp_f64_e32 v[0:1], s[2:3] +; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; VI-NEXT: v_fma_f64 v[2:3], -s[2:3], v[0:1], 1.0 +; VI-NEXT: s_mov_b32 s2, 0 +; VI-NEXT: s_brev_b32 s3, 1 +; VI-NEXT: v_fma_f64 v[0:1], v[2:3], v[0:1], v[0:1] +; VI-NEXT: v_mov_b32_e32 v2, s1 +; VI-NEXT: v_mov_b32_e32 v3, s0 +; VI-NEXT: v_mul_f64 v[0:1], v[0:1], s[2:3] +; VI-NEXT: v_cmp_nlt_f64_e64 vcc, -v[0:1], s[0:1] +; VI-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; VI-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; VI-NEXT: v_cmp_nlt_f64_e32 vcc, 0, v[0:1] +; VI-NEXT: s_and_b64 s[0:1], vcc, exec +; VI-NEXT: s_cselect_b32 s1, 0, 0x7ff80000 +; VI-NEXT: s_mov_b32 s0, 0 +; VI-NEXT: ; return to shader part epilog .entry: %tmp7 = fdiv afn double 1.000000e+00, %tmp6 %tmp8 = fmul double 0.000000e+00, %tmp7 %tmp9 = fmul reassoc nnan arcp contract double 0.000000e+00, %tmp8 - %.i188 = fadd double %tmp9, 0.000000e+00 + %.i188 = fadd nsz double %tmp9, 0.000000e+00 %tmp10 = fcmp uge double %.i188, %tmp2 %tmp11 = fneg double %.i188 %.i092 = select i1 %tmp10, double %tmp2, double %tmp11 diff --git a/llvm/test/CodeGen/X86/fadd-combines.ll b/llvm/test/CodeGen/X86/fadd-combines.ll index 1082177e3da19..2c06c538ae10d 100644 --- a/llvm/test/CodeGen/X86/fadd-combines.ll +++ b/llvm/test/CodeGen/X86/fadd-combines.ll @@ -5,7 +5,7 @@ define float @fadd_zero_f32(float %x) #0 { ; CHECK-LABEL: fadd_zero_f32: ; CHECK: # %bb.0: ; CHECK-NEXT: retq - %y = fadd float %x, 0.0 + %y = fadd nsz float %x, 0.0 ret float %y } @@ -13,7 +13,7 @@ define <4 x 
float> @fadd_zero_4f32(<4 x float> %x) #0 {
 ; CHECK-LABEL: fadd_zero_4f32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: retq
-  %y = fadd <4 x float> %x, <float 0.0, float 0.0, float 0.0, float 0.0>
+  %y = fadd nsz <4 x float> %x, <float 0.0, float 0.0, float 0.0, float 0.0>
   ret <4 x float> %y
 }
 
@@ -31,8 +31,8 @@ define float @fadd_2const_f32(float %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd float %x, 1.0
-  %z = fadd float %y, 2.0
+  %y = fadd reassoc nsz float %x, 1.0
+  %z = fadd reassoc nsz float %y, 2.0
   ret float %z
 }
 
@@ -45,8 +45,8 @@ define <4 x float> @fadd_2const_4f32(<4 x float> %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: addps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
-  %z = fadd <4 x float> %y, <float 4.0, float 3.0, float 2.0, float 1.0>
+  %y = fadd reassoc nsz <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fadd reassoc nsz <4 x float> %y, <float 4.0, float 3.0, float 2.0, float 1.0>
   ret <4 x float> %z
 }
 
@@ -56,8 +56,8 @@ define float @fadd_x_fmul_x_c_f32(float %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fmul float %x, 2.0
-  %z = fadd float %x, %y
+  %y = fmul reassoc nsz float %x, 2.0
+  %z = fadd reassoc nsz float %x, %y
   ret float %z
 }
 
@@ -70,8 +70,8 @@ define <4 x float> @fadd_x_fmul_x_c_4f32(<4 x float> %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
-  %z = fadd <4 x float> %x, %y
+  %y = fmul reassoc nsz <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fadd reassoc nsz <4 x float> %x, %y
   ret <4 x float> %z
 }
 
@@ -81,8 +81,8 @@ define float @fadd_fmul_x_c_x_f32(float %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fmul float %x, 2.0
-  %z = fadd float %y, %x
+  %y = fmul reassoc nsz float %x, 2.0
+  %z = fadd reassoc nsz float %y, %x
   ret float %z
 }
 
@@ -95,8 +95,8 @@ define <4 x float> @fadd_fmul_x_c_x_4f32(<4 x float> %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fmul <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
-  %z = fadd <4 x float> %y, %x
+  %y = fmul reassoc nsz <4 x float> %x, <float 1.0, float 2.0, float 3.0, float 4.0>
+  %z = fadd reassoc nsz <4 x float> %y, %x
   ret <4 x float> %z
 }
 
@@ -106,9 +106,9 @@ define float @fadd_fadd_x_x_fmul_x_c_f32(float %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd float %x, %x
-  %z = fmul float %x, 2.0
-  %w = fadd float %y, %z
+  %y = fadd reassoc nsz float %x, %x
+  %z = fmul reassoc nsz float %x, 2.0
+  %w = fadd reassoc nsz float %y, %z
   ret float %w
 }
 
@@ -121,9 +121,9 @@ define <4 x float> @fadd_fadd_x_x_fmul_x_c_4f32(<4 x float> %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd <4 x float> %x, %x
-  %z = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
-  %w = fadd <4 x float> %y, %z
+  %y = fadd reassoc nsz <4 x float> %x, %x
+  %z = fmul reassoc nsz <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+  %w = fadd reassoc nsz <4 x float> %y, %z
   ret <4 x float> %w
 }
 
@@ -133,9 +133,9 @@ define float @fadd_fmul_x_c_fadd_x_x_f32(float %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd float %x, %x
-  %z = fmul float %x, 2.0
-  %w = fadd float %z, %y
+  %y = fadd reassoc nsz float %x, %x
+  %z = fmul reassoc nsz float %x, 2.0
+  %w = fadd reassoc nsz float %z, %y
   ret float %w
 }
 
@@ -148,9 +148,9 @@ define <4 x float> @fadd_fmul_x_c_fadd_x_x_4f32(<4 x float> %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd <4 x float> %x, %x
-  %z = fmul <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
-  %w = fadd <4 x float> %z, %y
+  %y = fadd reassoc nsz <4 x float> %x, %x
+  %z = fmul reassoc nsz <4 x float> %x, <float 2.0, float 2.0, float 2.0, float 2.0>
+  %w = fadd reassoc nsz <4 x float> %z, %y
   ret <4 x float> %w
 }
 
@@ -160,8 +160,8 @@ define float @fadd_x_fadd_x_x_f32(float %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd float %x, %x
-  %z = fadd float %x, %y
+  %y = fadd reassoc nsz float %x, %x
+  %z = fadd reassoc nsz float %x, %y
   ret float %z
 }
 
@@ -174,8 +174,8 @@ define <4 x float> @fadd_x_fadd_x_x_4f32(<4 x float> %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd <4 x float> %x, %x
-  %z = fadd <4 x float> %x, %y
+  %y = fadd reassoc nsz <4 x float> %x, %x
+  %z = fadd reassoc nsz <4 x float> %x, %y
   ret <4 x float> %z
 }
 
@@ -185,8 +185,8 @@ define float @fadd_fadd_x_x_x_f32(float %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd float %x, %x
-  %z = fadd float %y, %x
+  %y = fadd reassoc nsz float %x, %x
+  %z = fadd reassoc nsz float %y, %x
   ret float %z
 }
 
@@ -199,8 +199,8 @@ define <4 x float> @fadd_fadd_x_x_x_4f32(<4 x float> %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd <4 x float> %x, %x
-  %z = fadd <4 x float> %y, %x
+  %y = fadd reassoc nsz <4 x float> %x, %x
+  %z = fadd reassoc nsz <4 x float> %y, %x
   ret <4 x float> %z
 }
 
@@ -210,8 +210,8 @@ define float @fadd_fadd_x_x_fadd_x_x_f32(float %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd float %x, %x
-  %z = fadd float %y, %y
+  %y = fadd reassoc nsz float %x, %x
+  %z = fadd reassoc nsz float %y, %y
   ret float %z
 }
 
@@ -224,8 +224,8 @@ define <4 x float> @fadd_fadd_x_x_fadd_x_x_4f32(<4 x float> %x) #0 {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: mulps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
-  %y = fadd <4 x float> %x, %x
-  %z = fadd <4 x float> %y, %y
+  %y = fadd reassoc nsz <4 x float> %x, %x
+  %z = fadd reassoc nsz <4 x float> %y, %y
   ret <4 x float> %z
 }
 
@@ -241,9 +241,9 @@ define float @fadd_const_multiuse_attr(float %x) #0 {
 ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: addss %xmm1, %xmm0
 ; CHECK-NEXT: retq
-  %a1 = fadd float %x, 42.0
-  %a2 = fadd float %a1, 17.0
-  %a3 = fadd float %a1, %a2
+  %a1 = fadd reassoc nsz float %x, 42.0
+  %a2 = fadd reassoc nsz float %a1, 17.0
+  %a3 = fadd reassoc nsz float %a1, %a2
   ret float %a3
 }
 
@@ -275,4 +275,4 @@ define <2 x double> @fmul2_negated_vec(<2 x double> %a, <2 x double> %b, <2 x do
   ret <2 x double> %sub
 }
 
-attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "no-signed-zeros-fp-math"="true" }
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
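
For reviewers, a minimal IR illustration (hypothetical reproducer, not part of the patch) of the first DAGCombiner hunk: after this change the x + 0.0 elimination keys off the instruction-level nsz flag instead of the global -enable-no-signed-zeros-fp-math option. Folding x + (+0.0) without nsz would be wrong when x is -0.0, because -0.0 + +0.0 evaluates to +0.0; x + (-0.0) still folds unconditionally.

```llvm
; Hypothetical reproducer, not taken from the patch.
define float @fold_pos_zero_nsz(float %x) {
  ; Expected to fold to %x: the +0.0 identity is legal under nsz.
  %y = fadd nsz float %x, 0.0
  ret float %y
}

define float @no_fold_pos_zero(float %x) {
  ; Expected to stay: if %x were -0.0, folding would change +0.0 into -0.0.
  %y = fadd float %x, 0.0
  ret float %y
}

define float @fold_neg_zero(float %x) {
  ; Expected to fold with no flags at all: x + -0.0 == x for every x.
  %y = fadd float %x, -0.0
  ret float %y
}
```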
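Likewise, a sketch of why the X86 tests above now carry reassoc nsz on each instruction (assumed flag placement, mirroring the updated tests rather than quoting the patch): the constant-merging fold in the second DAGCombiner hunk fires only when the nodes themselves allow reassociation and ignore signed zeros, so the function-level fast-math attributes alone no longer enable it.

```llvm
; Hypothetical example, not from the patch: with reassoc and nsz on both
; adds, (x + 1.0) + 2.0 can be rewritten as x + 3.0. Dropping either flag
; is expected to block the fold even under -enable-no-signed-zeros-fp-math.
define float @merge_two_constants(float %x) {
  %y = fadd reassoc nsz float %x, 1.0
  %z = fadd reassoc nsz float %y, 2.0
  ret float %z
}
```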