llvm · fhahn · Nov 6, 2025 · Nov 15, 2025 · Nov 15, 2025 · ayalz
diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h
@@ -417,6 +417,10 @@ m_BranchOnCount(const Op0_t &Op0, const Op1_t &Op1) {
   return m_VPInstruction<VPInstruction::BranchOnCount>(Op0, Op1);
 }
 
+/// Match a VPInstruction with opcode AnyOf, without constraining its operands
+/// (no operand sub-patterns are passed; presumably any operand count matches
+/// -- TODO confirm against m_VPInstruction).
+inline VPInstruction_match<VPInstruction::AnyOf> m_AnyOf() {
+  return m_VPInstruction<VPInstruction::AnyOf>();
+}
+
 template <typename Op0_t>
 inline VPInstruction_match<VPInstruction::AnyOf, Op0_t>
 m_AnyOf(const Op0_t &Op0) {

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1220,6 +1220,37 @@ static void simplifyRecipe(VPSingleDefRecipe *Def, VPTypeAnalysis &TypeInfo) {
     }
   }
 
+  // Fold any-of (fcmp uno %A, %A), (fcmp uno %B, %B), ... ->
+  //      any-of (fcmp uno %A, %B), ...
+  if (match(Def, m_AnyOf())) {
+    SmallVector<VPValue *, 4> NewOps;
+    // Single-use (fcmp uno %X, %X) operand still waiting for a partner.
+    VPRecipeBase *UnpairedCmp = nullptr;
+    for (VPValue *Op : Def->operands()) {
+      VPValue *X;
+      if (Op->getNumUsers() > 1 ||
+          !match(Op, m_SpecificCmp(CmpInst::FCMP_UNO, m_VPValue(X),
+                                   m_Deferred(X)))) {
+        // Multi-use operands and non-self-compares are kept unchanged.
+        NewOps.push_back(Op);
+      } else if (!UnpairedCmp) {
+        UnpairedCmp = Op->getDefiningRecipe();
+      } else {
+        // Both operands of the pending compare are identical, so operand 0
+        // stands for it in the merged compare.
+        NewOps.push_back(Builder.createFCmp(CmpInst::FCMP_UNO,
+                                            UnpairedCmp->getOperand(0), X));
+        UnpairedCmp = nullptr;
+      }
+    }
+
+    // A leftover self-compare without a partner is re-added as is.
+    if (UnpairedCmp)
+      NewOps.push_back(UnpairedCmp->getVPSingleValue());
+
+    // Only rebuild the any-of if at least one pair was merged.
+    if (NewOps.size() < Def->getNumOperands()) {
+      VPValue *NewAnyOf = Builder.createNaryOp(VPInstruction::AnyOf, NewOps);
+      return Def->replaceAllUsesWith(NewAnyOf);
+    }
+  }
+
   // Fold (fcmp uno %X, %X) or (fcmp uno %Y, %Y) -> fcmp uno %X, %Y
   // This is useful for fmax/fmin without fast-math flags, where we need to
   // check if any operand is NaN.

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmax-without-fast-math-flags.ll
@@ -59,11 +59,8 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP18:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT:    [[TMP15:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP18]], [[TMP15]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]

diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fmin-without-fast-math-flags.ll
@@ -59,11 +59,8 @@ define float @fminnum(ptr %src, i64 %n) {
 ; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.minnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT:    [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]

diff --git a/llvm/test/Transforms/LoopVectorize/fcmp-uno-fold-interleave.ll b/llvm/test/Transforms/LoopVectorize/fcmp-uno-fold-interleave.ll
@@ -28,14 +28,11 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; IC3-NEXT:    [[TMP4]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD3]])
 ; IC3-NEXT:    [[TMP5]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI2]], <4 x float> [[WIDE_LOAD4]])
 ; IC3-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 12
-; IC3-NEXT:    [[TMP6:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; IC3-NEXT:    [[TMP7:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD3]]
 ; IC3-NEXT:    [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]]
-; IC3-NEXT:    [[TMP9:%.*]] = freeze <4 x i1> [[TMP6]]
+; IC3-NEXT:    [[TMP7:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD3]]
 ; IC3-NEXT:    [[TMP10:%.*]] = freeze <4 x i1> [[TMP7]]
-; IC3-NEXT:    [[TMP11:%.*]] = or <4 x i1> [[TMP9]], [[TMP10]]
 ; IC3-NEXT:    [[TMP12:%.*]] = freeze <4 x i1> [[TMP8]]
-; IC3-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP11]], [[TMP12]]
+; IC3-NEXT:    [[TMP13:%.*]] = or <4 x i1> [[TMP10]], [[TMP12]]
 ; IC3-NEXT:    [[TMP14:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP13]])
 ; IC3-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IC3-NEXT:    [[TMP16:%.*]] = or i1 [[TMP14]], [[TMP15]]
@@ -86,17 +83,11 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; IC4-NEXT:    [[TMP6]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI2]], <4 x float> [[WIDE_LOAD5]])
 ; IC4-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI3]], <4 x float> [[WIDE_LOAD6]])
 ; IC4-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; IC4-NEXT:    [[TMP8:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; IC4-NEXT:    [[TMP9:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD4]], [[WIDE_LOAD4]]
-; IC4-NEXT:    [[TMP24:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]]
-; IC4-NEXT:    [[TMP25:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]]
-; IC4-NEXT:    [[TMP10:%.*]] = freeze <4 x i1> [[TMP8]]
-; IC4-NEXT:    [[TMP11:%.*]] = freeze <4 x i1> [[TMP9]]
-; IC4-NEXT:    [[TMP12:%.*]] = or <4 x i1> [[TMP10]], [[TMP11]]
+; IC4-NEXT:    [[TMP24:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; IC4-NEXT:    [[TMP25:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD6]]
 ; IC4-NEXT:    [[TMP26:%.*]] = freeze <4 x i1> [[TMP24]]
-; IC4-NEXT:    [[TMP27:%.*]] = or <4 x i1> [[TMP12]], [[TMP26]]
 ; IC4-NEXT:    [[TMP28:%.*]] = freeze <4 x i1> [[TMP25]]
-; IC4-NEXT:    [[TMP29:%.*]] = or <4 x i1> [[TMP27]], [[TMP28]]
+; IC4-NEXT:    [[TMP29:%.*]] = or <4 x i1> [[TMP26]], [[TMP28]]
 ; IC4-NEXT:    [[TMP13:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP29]])
 ; IC4-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; IC4-NEXT:    [[TMP15:%.*]] = or i1 [[TMP13]], [[TMP14]]
@@ -153,18 +144,12 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; IC5-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI3]], <4 x float> [[WIDE_LOAD7]])
 ; IC5-NEXT:    [[TMP9]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI4]], <4 x float> [[WIDE_LOAD8]])
 ; IC5-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 20
-; IC5-NEXT:    [[TMP10:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; IC5-NEXT:    [[TMP11:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD5]], [[WIDE_LOAD5]]
-; IC5-NEXT:    [[TMP12:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD6]]
-; IC5-NEXT:    [[TMP13:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD7]], [[WIDE_LOAD7]]
 ; IC5-NEXT:    [[TMP14:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD8]], [[WIDE_LOAD8]]
-; IC5-NEXT:    [[TMP15:%.*]] = freeze <4 x i1> [[TMP10]]
-; IC5-NEXT:    [[TMP16:%.*]] = freeze <4 x i1> [[TMP11]]
-; IC5-NEXT:    [[TMP17:%.*]] = or <4 x i1> [[TMP15]], [[TMP16]]
+; IC5-NEXT:    [[TMP12:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD5]]
+; IC5-NEXT:    [[TMP13:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD6]], [[WIDE_LOAD7]]
 ; IC5-NEXT:    [[TMP18:%.*]] = freeze <4 x i1> [[TMP12]]
-; IC5-NEXT:    [[TMP19:%.*]] = or <4 x i1> [[TMP17]], [[TMP18]]
 ; IC5-NEXT:    [[TMP20:%.*]] = freeze <4 x i1> [[TMP13]]
-; IC5-NEXT:    [[TMP21:%.*]] = or <4 x i1> [[TMP19]], [[TMP20]]
+; IC5-NEXT:    [[TMP21:%.*]] = or <4 x i1> [[TMP18]], [[TMP20]]
 ; IC5-NEXT:    [[TMP22:%.*]] = freeze <4 x i1> [[TMP14]]
 ; IC5-NEXT:    [[TMP23:%.*]] = or <4 x i1> [[TMP21]], [[TMP22]]
 ; IC5-NEXT:    [[TMP24:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP23]])

diff --git a/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll b/llvm/test/Transforms/LoopVectorize/fmax-without-fast-math-flags-interleave.ll
@@ -59,11 +59,8 @@ define float @fmaxnum(ptr %src, i64 %n) {
 ; CHECK-NEXT:    [[TMP7]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI]], <4 x float> [[WIDE_LOAD]])
 ; CHECK-NEXT:    [[TMP8]] = call <4 x float> @llvm.maxnum.v4f32(<4 x float> [[VEC_PHI1]], <4 x float> [[WIDE_LOAD2]])
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[IV]], 8
-; CHECK-NEXT:    [[TMP3:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD]]
-; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD2]]
-; CHECK-NEXT:    [[TMP15:%.*]] = freeze <4 x i1> [[TMP3]]
-; CHECK-NEXT:    [[TMP18:%.*]] = freeze <4 x i1> [[TMP4]]
-; CHECK-NEXT:    [[TMP5:%.*]] = or <4 x i1> [[TMP15]], [[TMP18]]
+; CHECK-NEXT:    [[TMP4:%.*]] = fcmp uno <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = freeze <4 x i1> [[TMP4]]
 ; CHECK-NEXT:    [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP5]])
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    [[TMP10:%.*]] = or i1 [[TMP6]], [[TMP9]]