diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index c89d9fae639a7..2423b95930319 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1220,6 +1220,17 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) { } } + // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y + // This is useful for fmax/fmin without fast-math flags, where we need to + // check if any operand is NaN. + if (match(Def, m_BinaryOr(m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X), + m_Deferred(X)), + m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(Y), + m_Deferred(Y))))) { + VPValue *NewCmp = Builder.createFCmp(CmpInst::FCMP_UNO, X, Y); + return Def->replaceAllUsesWith(NewCmp); + } + // Remove redundant DerviedIVs, that is 0 + A * 1 -> A and 0 + 0 * x -> 0. if ((match(Def, m_DerivedIV(m_ZeroInt(), m_VPValue(A), m_One())) || match(Def, m_DerivedIV(m_ZeroInt(), m_ZeroInt(), m_VPValue()))) && diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll index 7e58d9d6a8ec9..f3d649b899686 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll @@ -142,14 +142,10 @@ define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) { ; CHECK-NEXT: [[TMP6]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD5]]) ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]] -; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]] -; CHECK-NEXT: [[TMP14:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]] -; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP8]], [[TMP14]] -; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP12]] -; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP13]] +; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP9]] ; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]]) ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll index 01fab87209a35..ca6e5bc2d0dcb 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll @@ -141,14 +141,10 @@ define float @test_fmax_and_fmin(ptr %src.0, ptr %src.1, i64 %n) { ; CHECK-NEXT: [[TMP6]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD5]]) ; CHECK-NEXT: [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD6]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 8 -; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]] -; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]] -; CHECK-NEXT: [[TMP14:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP15:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]] -; CHECK-NEXT: [[TMP12:%.*]] = or <4 x i1> [[TMP8]], [[TMP14]] -; CHECK-NEXT: [[TMP13:%.*]] = or <4 x i1> [[TMP9]], [[TMP15]] -; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP12]] -; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP13]] +; CHECK-NEXT: [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD4]] +; CHECK-NEXT: [[TMP16:%.*]] = freeze <4 x i1> [[TMP8]] +; CHECK-NEXT: [[TMP17:%.*]] = freeze <4 x i1> [[TMP9]] ; CHECK-NEXT: [[TMP18:%.*]] = or <4 x i1> [[TMP16]], [[TMP17]] ; CHECK-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]]) ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll index e028bec138faf..a4f7631435bb3 100644 --- a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll +++ b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags.ll @@ -697,10 +697,8 @@ define float @test_fmax_and_fmax(ptr %src.0, ptr %src.1, i64 %n) { ; CHECK-NEXT: [[TMP2]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP3]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD2]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[IV]], 4 -; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]] -; CHECK-NEXT: [[TMP7:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP6:%.*]] = or <4 x i1> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP6]] +; CHECK-NEXT: [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP8:%.*]] = freeze <4 x i1> [[TMP4]] ; CHECK-NEXT: [[TMP9:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP8]]) ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: [[TMP10:%.*]] = or i1 [[TMP9]], [[TMP11]]