From 656a2a5c07020fc07e9e67507a5942d60c40438a Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Thu, 28 Aug 2025 01:42:46 +0200
Subject: [PATCH 1/2] wasm: recognize `any_true` and `all_true`

---
 .../include/llvm/Target/TargetSelectionDAG.td |   3 +
 .../WebAssembly/WebAssemblyISelLowering.cpp   |  10 ++
 .../WebAssembly/WebAssemblyInstrSIMD.td       |  10 ++
 .../WebAssemblyTargetTransformInfo.cpp        |  16 +++
 .../WebAssemblyTargetTransformInfo.h          |   1 +
 llvm/test/CodeGen/WebAssembly/any-all-true.ll | 125 ++++++++++++++++++
 6 files changed, 165 insertions(+)
 create mode 100644 llvm/test/CodeGen/WebAssembly/any-all-true.ll

diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
index a4ed62bb5715c..69aa748f0f4f1 100644
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -511,6 +511,9 @@ def vecreduce_smax  : SDNode<"ISD::VECREDUCE_SMAX", SDTVecReduce>;
 def vecreduce_umax  : SDNode<"ISD::VECREDUCE_UMAX", SDTVecReduce>;
 def vecreduce_smin  : SDNode<"ISD::VECREDUCE_SMIN", SDTVecReduce>;
 def vecreduce_umin  : SDNode<"ISD::VECREDUCE_UMIN", SDTVecReduce>;
+def vecreduce_and   : SDNode<"ISD::VECREDUCE_AND", SDTVecReduce>;
+def vecreduce_or    : SDNode<"ISD::VECREDUCE_OR", SDTVecReduce>;
+def vecreduce_xor   : SDNode<"ISD::VECREDUCE_XOR", SDTVecReduce>;
 def vecreduce_fadd  : SDNode<"ISD::VECREDUCE_FADD", SDTFPVecReduce>;
 def vecreduce_fmin  : SDNode<"ISD::VECREDUCE_FMIN", SDTFPVecReduce>;
 def vecreduce_fmax  : SDNode<"ISD::VECREDUCE_FMAX", SDTFPVecReduce>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 35d5c3ed90c91..6e3aab4094459 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -281,6 +281,16 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setOperationAction(ISD::CTLZ, MVT::v16i8, Expand);
     setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
 
+    setOperationAction(ISD::VECREDUCE_AND, MVT::v16i8, Legal);
+    setOperationAction(ISD::VECREDUCE_AND, MVT::v8i16, Legal);
+    setOperationAction(ISD::VECREDUCE_AND, MVT::v4i32, Legal);
+    setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Legal);
+
+    setOperationAction(ISD::VECREDUCE_OR, MVT::v16i8, Legal);
+    setOperationAction(ISD::VECREDUCE_OR, MVT::v8i16, Legal);
+    setOperationAction(ISD::VECREDUCE_OR, MVT::v4i32, Legal);
+    setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Legal);
+
     // Custom lower bit counting operations for other types to scalarize them.
     for (auto Op : {ISD::CTLZ, ISD::CTTZ, ISD::CTPOP})
       for (auto T : {MVT::v8i16, MVT::v4i32, MVT::v2i64})
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 143298b700928..d129313115032 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -997,6 +997,16 @@ def : Pat<(i32 (setne (i32 (intrinsic (vec.vt V128:$x))), (i32 0))), (inst $x)>;
 def : Pat<(i32 (seteq (i32 (intrinsic (vec.vt V128:$x))), (i32 1))), (inst $x)>;
 }
 
+def : Pat<(i32 (setcc (and (i32 (vecreduce_and (v16i8 V128:$vec))), (i32 255)), (i32 255), SETEQ)), (ALLTRUE_I8x16 V128:$vec)>;
+def : Pat<(i32 (setcc (and (i32 (vecreduce_and (v8i16 V128:$vec))), (i32 65535)), (i32 65535), SETEQ)), (ALLTRUE_I16x8 V128:$vec)>;
+def : Pat<(i32 (setcc (i32 (vecreduce_and(v4i32 V128:$vec))), (i32 -1), SETEQ)), (ALLTRUE_I32x4 V128:$vec)>;
+def : Pat<(i32 (setcc (i64 (vecreduce_and(v2i64 V128:$vec))), (i64 -1), SETEQ)), (ALLTRUE_I64x2 V128:$vec)>;
+
+def : Pat<(i32 (setcc (and (i32 (vecreduce_or(v16i8 V128:$vec))), (i32 255)), (i32 0), SETNE)), (ANYTRUE V128:$vec)>;
+def : Pat<(i32 (setcc (and (i32 (vecreduce_or(v8i16 V128:$vec))), (i32 65535)), (i32 0), SETNE)), (ANYTRUE V128:$vec)>;
+def : Pat<(i32 (setcc (vecreduce_or(v4i32 V128:$vec)), (i32 0), SETNE)), (ANYTRUE V128:$vec)>;
+def : Pat<(i32 (setcc (vecreduce_or(v2i64 V128:$vec)), (i64 0), SETNE)), (ANYTRUE V128:$vec)>;
+
 multiclass SIMDBitmask<Vec vec, bits<32> simdop> {
   defm _#vec : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
                       [(set I32:$dst,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index 08fb7586d215e..efba2f8c8f805 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -327,3 +327,19 @@ bool WebAssemblyTTIImpl::isProfitableToSinkOperands(
 
   return false;
 }
+
+bool WebAssemblyTTIImpl::shouldExpandReduction(const IntrinsicInst *II) const {
+  // With no SIMD128 there are no vector instructions, so expand everything.
+  if (!ST->hasSIMD128())
+    return true;
+
+  // Integer and/or reductions are kept (not expanded) so that the backend can
+  // recognize any_true/all_true; every other reduction is expanded.
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::vector_reduce_and:
+  case Intrinsic::vector_reduce_or:
+    return false;
+  default:
+    return true;
+  }
+}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index c915eeb07d4fd..996b5e45daad1 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -100,6 +100,7 @@ class WebAssemblyTTIImpl final : public BasicTTIImplBase<WebAssemblyTTIImpl> {
   bool isProfitableToSinkOperands(Instruction *I,
                                   SmallVectorImpl<Use *> &Ops) const override;
 
+  bool shouldExpandReduction(const IntrinsicInst *II) const override;
   /// @}
 };
 
diff --git a/llvm/test/CodeGen/WebAssembly/any-all-true.ll b/llvm/test/CodeGen/WebAssembly/any-all-true.ll
new file mode 100644
index 0000000000000..0db5b90ebd053
--- /dev/null
+++ b/llvm/test/CodeGen/WebAssembly/any-all-true.ll
@@ -0,0 +1,125 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+
+; RUN: llc < %s -verify-machineinstrs -mattr=+simd128 | FileCheck %s
+
+target triple = "wasm32-unknown-unknown"
+
+declare i8 @llvm.vector.reduce.and.v16i8(<16 x i8>)
+declare i8 @llvm.vector.reduce.or.v16i8(<16 x i8>)
+declare i16 @llvm.vector.reduce.and.v8i16(<8 x i16>)
+declare i16 @llvm.vector.reduce.or.v8i16(<8 x i16>)
+declare i32 @llvm.vector.reduce.and.v4i32(<4 x i32>)
+declare i32 @llvm.vector.reduce.or.v4i32(<4 x i32>)
+declare i64 @llvm.vector.reduce.and.v2i64(<2 x i64>)
+declare i64 @llvm.vector.reduce.or.v2i64(<2 x i64>)
+
+define zeroext i1 @manual_i8x16_all_true(<4 x i32> %a) {
+; CHECK-LABEL: manual_i8x16_all_true:
+; CHECK:         .functype manual_i8x16_all_true (v128) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i8x16.all_true
+; CHECK-NEXT:    # fallthrough-return
+start:
+  %_3 = bitcast <4 x i32> %a to <16 x i8>
+  %0 = tail call i8 @llvm.vector.reduce.and.v16i8(<16 x i8> %_3)
+  %_0 = icmp eq i8 %0, -1
+  ret i1 %_0
+}
+
+define zeroext i1 @manual_i16x8_all_true(<4 x i32> %a) {
+; CHECK-LABEL: manual_i16x8_all_true:
+; CHECK:         .functype manual_i16x8_all_true (v128) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i16x8.all_true
+; CHECK-NEXT:    # fallthrough-return
+start:
+  %_3 = bitcast <4 x i32> %a to <8 x i16>
+  %0 = tail call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> %_3)
+  %_0 = icmp eq i16 %0, -1
+  ret i1 %_0
+}
+
+define zeroext i1 @manual_i32x4_all_true(<4 x i32> %a) {
+; CHECK-LABEL: manual_i32x4_all_true:
+; CHECK:         .functype manual_i32x4_all_true (v128) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i32x4.all_true
+; CHECK-NEXT:    # fallthrough-return
+start:
+  %0 = tail call i32 @llvm.vector.reduce.and.v4i32(<4 x i32> %a)
+  %_0 = icmp eq i32 %0, -1
+  ret i1 %_0
+}
+
+define zeroext i1 @manual_i64x2_all_true(<2 x i64> %a) {
+; CHECK-LABEL: manual_i64x2_all_true:
+; CHECK:         .functype manual_i64x2_all_true (v128) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    i64x2.all_true
+; CHECK-NEXT:    # fallthrough-return
+start:
+  %0 = tail call i64 @llvm.vector.reduce.and.v2i64(<2 x i64> %a)
+  %_0 = icmp eq i64 %0, -1
+  ret i1 %_0
+}
+
+; ---
+
+define zeroext i1 @manual_i8x16_any_true(<4 x i32> %a) {
+; CHECK-LABEL: manual_i8x16_any_true:
+; CHECK:         .functype manual_i8x16_any_true (v128) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.any_true
+; CHECK-NEXT:    # fallthrough-return
+start:
+  %_3 = bitcast <4 x i32> %a to <16 x i8>
+  %0 = tail call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %_3)
+  %_0 = icmp ne i8 %0, 0
+  ret i1 %_0
+}
+
+define i1 @i16x8_any_true(<4 x i32> %a) {
+; CHECK-LABEL: i16x8_any_true:
+; CHECK:         .functype i16x8_any_true (v128) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.any_true
+; CHECK-NEXT:    # fallthrough-return
+start:
+  %_3 = bitcast <4 x i32> %a to <8 x i16>
+  %0 = tail call i16 @llvm.vector.reduce.or.v8i16(<8 x i16> %_3)
+  %_0 = icmp ne i16 %0, 0
+  ret i1 %_0
+}
+
+define i1 @manual_i32x4_any_true(<4 x i32> %a) {
+; CHECK-LABEL: manual_i32x4_any_true:
+; CHECK:         .functype manual_i32x4_any_true (v128) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.any_true
+; CHECK-NEXT:    # fallthrough-return
+start:
+  %0 = tail call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> %a)
+  %_0 = icmp ne i32 %0, 0
+  ret i1 %_0
+}
+
+
+define zeroext i1 @manual_i64x2_any_true(<2 x i64> %a) {
+; CHECK-LABEL: manual_i64x2_any_true:
+; CHECK:         .functype manual_i64x2_any_true (v128) -> (i32)
+; CHECK-NEXT:  # %bb.0: # %start
+; CHECK-NEXT:    local.get 0
+; CHECK-NEXT:    v128.any_true
+; CHECK-NEXT:    # fallthrough-return
+start:
+  %0 = tail call i64 @llvm.vector.reduce.or.v2i64(<2 x i64> %a)
+  %_0 = icmp ne i64 %0, 0
+  ret i1 %_0
+}

From 49897d4ecdc9db3fbaf175416c3ab56046dfa09e Mon Sep 17 00:00:00 2001
From: Folkert de Vries <folkert@folkertdev.nl>
Date: Sat, 30 Aug 2025 21:02:27 +0200
Subject: [PATCH 2/2] wasm: explicitly combine setcc and vecreduce

---
 .../WebAssembly/WebAssemblyISelLowering.cpp   | 84 ++++++++++++++++---
 .../WebAssembly/WebAssemblyInstrSIMD.td       | 10 ---
 llvm/test/CodeGen/WebAssembly/any-all-true.ll |  2 +-
 3 files changed, 74 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 6e3aab4094459..db292ab32f8b2 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -281,16 +281,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     setOperationAction(ISD::CTLZ, MVT::v16i8, Expand);
     setOperationAction(ISD::CTTZ, MVT::v16i8, Expand);
 
-    setOperationAction(ISD::VECREDUCE_AND, MVT::v16i8, Legal);
-    setOperationAction(ISD::VECREDUCE_AND, MVT::v8i16, Legal);
-    setOperationAction(ISD::VECREDUCE_AND, MVT::v4i32, Legal);
-    setOperationAction(ISD::VECREDUCE_AND, MVT::v2i64, Legal);
-
-    setOperationAction(ISD::VECREDUCE_OR, MVT::v16i8, Legal);
-    setOperationAction(ISD::VECREDUCE_OR, MVT::v8i16, Legal);
-    setOperationAction(ISD::VECREDUCE_OR, MVT::v4i32, Legal);
-    setOperationAction(ISD::VECREDUCE_OR, MVT::v2i64, Legal);
-
     // Custom lower bit counting operations for other types to scalarize them.
     for (auto Op : {ISD::CTLZ, ISD::CTTZ, ISD::CTPOP})
       for (auto T : {MVT::v8i16, MVT::v4i32, MVT::v2i64})
@@ -3396,6 +3386,75 @@ static SDValue TryMatchTrue(SDNode *N, EVT VecVT, SelectionDAG &DAG) {
   return DAG.getZExtOrTrunc(Ret, DL, N->getValueType(0));
 }
 
+// Fold a setcc of an integer OR/AND reduction into a SIMD boolean reduction:
+//
+//   setcc (vecreduce_or  X), 0, ne   ==> any_true X
+//   setcc (vecreduce_and X), -1, eq  ==> all_true X
+//
+// Once an i8/i16 reduction result has been promoted it is wrapped in
+// (and ..., 2^N - 1) and the all_true constant becomes 2^N - 1; that form is
+// accepted as well. Returns an empty SDValue when nothing matches.
+//
+// NOTE(review): all_true is 1 when every lane is non-zero, whereas
+// (vecreduce_and X) == -1 requires every bit to be set. The two only agree if
+// each lane of X is 0 or -1; confirm before landing (any-all-true.ll
+// currently expects the fold to fire unconditionally).
+static SDValue combineSetCCVecReduce(SDNode *SetCC,
+                                     TargetLowering::DAGCombinerInfo &DCI) {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue Reduce = SetCC->getOperand(0);
+
+  // A non-constant right-hand side is simply not a match.
+  auto *RHS = dyn_cast<ConstantSDNode>(SetCC->getOperand(1));
+  if (!RHS)
+    return SDValue();
+
+  // Look through the mask that truncates a promoted i8/i16 reduction result.
+  const ConstantSDNode *Mask = nullptr;
+  if (Reduce.getOpcode() == ISD::AND) {
+    Mask = dyn_cast<ConstantSDNode>(Reduce.getOperand(1));
+    if (!Mask)
+      return SDValue();
+    Reduce = Reduce.getOperand(0);
+  }
+
+  // OR pairs with "!= 0" (any_true); AND pairs with "== all-ones" (all_true).
+  const bool IsAny = Reduce.getOpcode() == ISD::VECREDUCE_OR;
+  if (!IsAny && Reduce.getOpcode() != ISD::VECREDUCE_AND)
+    return SDValue();
+  const ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+  if (CC != (IsAny ? ISD::SETNE : ISD::SETEQ))
+    return SDValue();
+
+  // Only legal integer vectors can feed any_true/all_true; this also rejects
+  // every vector type when SIMD128 is unavailable.
+  SDValue Vec = Reduce.getOperand(0);
+  const EVT VecVT = Vec.getValueType();
+  if (!VecVT.isInteger() || !DAG.getTargetLoweringInfo().isTypeLegal(VecVT))
+    return SDValue();
+
+  // A mask must keep exactly the element bits; without one the reduction
+  // result must already be element-sized.
+  const unsigned EltBits = VecVT.getScalarSizeInBits();
+  if (Mask ? !Mask->getAPIntValue().isMask(EltBits)
+           : Reduce.getScalarValueSizeInBits() != EltBits)
+    return SDValue();
+
+  // The constant is zero for any_true and "all element bits set" otherwise.
+  const APInt &C = RHS->getAPIntValue();
+  if (IsAny ? !C.isZero()
+            : (Mask ? C != Mask->getAPIntValue() : !C.isAllOnes()))
+    return SDValue();
+
+  const unsigned Intrin =
+      IsAny ? Intrinsic::wasm_anytrue : Intrinsic::wasm_alltrue;
+  SDLoc DL(SetCC);
+  SDValue Bool = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+                             {DAG.getConstant(Intrin, DL, MVT::i32), Vec});
+  // Convert to the setcc's own result type instead of assuming i1.
+  return DAG.getZExtOrTrunc(Bool, DL, SetCC->getValueType(0));
+}
+
 /// Try to convert a i128 comparison to a v16i8 comparison before type
 /// legalization splits it up into chunks
 static SDValue
@@ -3456,6 +3515,9 @@ static SDValue performSETCCCombine(SDNode *N,
   if (SDValue V = combineVectorSizedSetCCEquality(N, DCI, Subtarget))
     return V;
 
+  if (SDValue V = combineSetCCVecReduce(N, DCI))
+    return V;
+
   SDValue LHS = N->getOperand(0);
   if (LHS->getOpcode() != ISD::BITCAST)
     return SDValue();
@@ -3470,9 +3532,9 @@ static SDValue performSETCCCombine(SDNode *N,
 
   if (!cast<ConstantSDNode>(N->getOperand(1)))
     return SDValue();
-
   EVT VecVT = FromVT.changeVectorElementType(MVT::getIntegerVT(128 / NumElts));
   auto &DAG = DCI.DAG;
+
   // setcc (iN (bitcast (vNi1 X))), 0, ne
   //   ==> any_true (vNi1 X)
   if (auto Match = TryMatchTrue<0, ISD::SETNE, false, Intrinsic::wasm_anytrue>(
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index d129313115032..143298b700928 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -997,16 +997,6 @@ def : Pat<(i32 (setne (i32 (intrinsic (vec.vt V128:$x))), (i32 0))), (inst $x)>;
 def : Pat<(i32 (seteq (i32 (intrinsic (vec.vt V128:$x))), (i32 1))), (inst $x)>;
 }
 
-def : Pat<(i32 (setcc (and (i32 (vecreduce_and (v16i8 V128:$vec))), (i32 255)), (i32 255), SETEQ)), (ALLTRUE_I8x16 V128:$vec)>;
-def : Pat<(i32 (setcc (and (i32 (vecreduce_and (v8i16 V128:$vec))), (i32 65535)), (i32 65535), SETEQ)), (ALLTRUE_I16x8 V128:$vec)>;
-def : Pat<(i32 (setcc (i32 (vecreduce_and(v4i32 V128:$vec))), (i32 -1), SETEQ)), (ALLTRUE_I32x4 V128:$vec)>;
-def : Pat<(i32 (setcc (i64 (vecreduce_and(v2i64 V128:$vec))), (i64 -1), SETEQ)), (ALLTRUE_I64x2 V128:$vec)>;
-
-def : Pat<(i32 (setcc (and (i32 (vecreduce_or(v16i8 V128:$vec))), (i32 255)), (i32 0), SETNE)), (ANYTRUE V128:$vec)>;
-def : Pat<(i32 (setcc (and (i32 (vecreduce_or(v8i16 V128:$vec))), (i32 65535)), (i32 0), SETNE)), (ANYTRUE V128:$vec)>;
-def : Pat<(i32 (setcc (vecreduce_or(v4i32 V128:$vec)), (i32 0), SETNE)), (ANYTRUE V128:$vec)>;
-def : Pat<(i32 (setcc (vecreduce_or(v2i64 V128:$vec)), (i64 0), SETNE)), (ANYTRUE V128:$vec)>;
-
 multiclass SIMDBitmask<Vec vec, bits<32> simdop> {
   defm _#vec : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
                       [(set I32:$dst,
diff --git a/llvm/test/CodeGen/WebAssembly/any-all-true.ll b/llvm/test/CodeGen/WebAssembly/any-all-true.ll
index 0db5b90ebd053..b6fd0cde83bec 100644
--- a/llvm/test/CodeGen/WebAssembly/any-all-true.ll
+++ b/llvm/test/CodeGen/WebAssembly/any-all-true.ll
@@ -111,7 +111,7 @@ start:
 }
 
 
-define zeroext i1 @manual_i64x2_any_true(<2 x i64> %a) {
+define i1 @manual_i64x2_any_true(<2 x i64> %a) {
 ; CHECK-LABEL: manual_i64x2_any_true:
 ; CHECK:         .functype manual_i64x2_any_true (v128) -> (i32)
 ; CHECK-NEXT:  # %bb.0: # %start