From edf26bba9d84dbef40daf2bc4514335a7519333a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 14 Oct 2025 18:09:55 +0200 Subject: [PATCH 01/39] 8340093 --- src/hotspot/share/opto/vtransform.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 7ad7b432e9b43..bfd124a6ed772 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -51,6 +51,10 @@ // - Compute linearization of the VTransformGraph, into an order that respects // all edges in the graph (bailout if cycle detected). // +// - Cost-Model: +// - We use a cost-model as a heuristic to determine if vectorization is profitable. +// Compute the cost of the loop with and without vectorization. +// // - Apply: // - Changes to the C2 IR are only made once the "apply" method is called. // - Align the main loop, by adjusting pre loop limit. From ce4ce1f0fd50cbe0f015132b8270f4d173803121 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 15 Oct 2025 09:08:46 +0200 Subject: [PATCH 02/39] add cost to matcher --- src/hotspot/share/opto/matcher.cpp | 27 +++++++++++++++++++++++++++ src/hotspot/share/opto/matcher.hpp | 5 +++++ 2 files changed, 32 insertions(+) diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index 7d73487cf8840..49cd1a4051561 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2677,6 +2677,33 @@ void Matcher::specialize_generic_vector_operands() { } } +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. +float Matcher::cost_for_scalar(int opcode) { + return 1; +} + +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. 
+float Matcher::cost_for_vector(int opcode, int vlen, BasicType bt) { + return 1; +} + +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. +float Matcher::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) { + if (requires_strict_order) { + // Linear: shuffle and reduce + return 2 * vlen; + } else { + // Recursive: shuffle and reduce + return 2 * exact_log2(vlen); + } +} + uint Matcher::vector_length(const Node* n) { const TypeVect* vt = n->bottom_type()->is_vect(); return vt->length(); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index 0b609b70ab5e9..7ab070bec4ab5 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -333,6 +333,11 @@ class Matcher : public PhaseTransform { static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen); + // Cost-Model for Auto-Vectorization + static float cost_for_scalar(int opcode); + static float cost_for_vector(int opcode, int vlen, BasicType bt); + static float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order); + static const RegMask* predicate_reg_mask(void); // Vector width in bytes From 8ac7d0ae12c16f7e894acbb3dd49774cdd88ead8 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 20 Oct 2025 15:59:50 +0200 Subject: [PATCH 03/39] rm old reduction heuristic --- src/hotspot/share/opto/superword.cpp | 67 +++------------------------- src/hotspot/share/opto/superword.hpp | 2 - 2 files changed, 7 insertions(+), 62 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index c0f005048ec66..11577af656eba 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -42,9 +42,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : ), 
_vpointer_for_main_loop_alignment(nullptr), _aw_for_main_loop_alignment(0), - _do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style - _num_work_vecs(0), // amount of vector work we have - _num_reductions(0) // amount of reduction work we have + _do_vector_loop(phase()->C->do_vector_loop()) // whether to do vectorization/simd style { } @@ -1567,18 +1565,6 @@ void SuperWord::filter_packs_for_implemented() { // Remove packs that are not profitable. void SuperWord::filter_packs_for_profitable() { - // Count the number of reductions vs other vector ops, for the - // reduction profitability heuristic. - for (int i = 0; i < _packset.length(); i++) { - Node_List* pack = _packset.at(i); - Node* n = pack->at(0); - if (is_marked_reduction(n)) { - _num_reductions++; - } else { - _num_work_vecs++; - } - } - // Remove packs that are not profitable auto filter = [&](const Node_List* pack) { return profitable(pack); @@ -1595,31 +1581,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const { if (p0 != nullptr) { int opc = p0->Opcode(); if (is_marked_reduction(p0)) { - const Type *arith_type = p0->bottom_type(); - // This heuristic predicts that 2-element reductions for INT/LONG are not - // profitable. This heuristic was added in JDK-8078563. The argument - // was that reductions are not just a single instruction, but multiple, and - // hence it is not directly clear that they are profitable. If we only have - // two elements per vector, then the performance gains from non-reduction - // vectors are at most going from 2 scalar instructions to 1 vector instruction. - // But a 2-element reduction vector goes from 2 scalar instructions to - // 3 instructions (1 shuffle and two reduction ops). 
- // However, this optimization assumes that these reductions stay in the loop - // which may not be true any more in most cases after the introduction of: - // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop - // Hence, this heuristic has room for improvement. - bool is_two_element_int_or_long_reduction = (size == 2) && - (arith_type->basic_type() == T_INT || - arith_type->basic_type() == T_LONG); - if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) { -#ifndef PRODUCT - if (is_trace_superword_rejections()) { - tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable."); - tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); - } -#endif - return false; - } + const Type* arith_type = p0->bottom_type(); retValue = ReductionNode::implemented(opc, size, arith_type->basic_type()); } else if (VectorNode::is_convert_opcode(opc)) { retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0)); @@ -1772,26 +1734,6 @@ bool SuperWord::profitable(const Node_List* p) const { // The second input has to be the vector we wanted to reduce, // but it was not packed. return false; - } else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) { - // This heuristic predicts that the reduction is not profitable. - // Reduction vectors can be expensive, because they require multiple - // operations to fold all the lanes together. Hence, vectorizing the - // reduction is not profitable on its own. Hence, we need a lot of - // other "work vectors" that deliver performance improvements to - // balance out the performance loss due to reductions. - // This heuristic is a bit simplistic, and assumes that the reduction - // vector stays in the loop. But in some cases, we can move the - // reduction out of the loop, replacing it with a single vector op. 
- // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop - // Hence, this heuristic has room for improvement. -#ifndef PRODUCT - if (is_trace_superword_rejections()) { - tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make"); - tty->print_cr(" reduction profitable."); - tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); - } -#endif - return false; } else if (second_pk->size() != p->size()) { return false; } @@ -1950,6 +1892,9 @@ bool SuperWord::do_vtransform() const { vtransform.optimize(); if (!vtransform.schedule()) { return false; } + + // TODO: use AutoVectorizationOverrideProfitability + // Maybe order it after the general bailout? if (vtransform.has_store_to_load_forwarding_failure()) { return false; } if (AutoVectorizationOverrideProfitability == 0) { @@ -1961,6 +1906,8 @@ bool SuperWord::do_vtransform() const { return false; } + // TODO: check cost + vtransform.apply(); return true; } diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 118e0aa042c79..9654465220b9c 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -549,8 +549,6 @@ class SuperWord : public ResourceObj { private: bool _do_vector_loop; // whether to do vectorization/simd style - int _num_work_vecs; // Number of non memory vector operations - int _num_reductions; // Number of reduction expressions applied // Accessors Arena* arena() { return &_arena; } From 57e69dfea5c8cac2dd6f289a89e59abea6d152dd Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 20 Oct 2025 16:14:42 +0200 Subject: [PATCH 04/39] refactor with is_profitable --- src/hotspot/share/opto/superword.cpp | 25 ++++++++++++++++++++----- src/hotspot/share/opto/vtransform.hpp | 1 + 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 11577af656eba..50e4ebece3072 
100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1893,22 +1893,37 @@ bool SuperWord::do_vtransform() const { if (!vtransform.schedule()) { return false; } - // TODO: use AutoVectorizationOverrideProfitability - // Maybe order it after the general bailout? - if (vtransform.has_store_to_load_forwarding_failure()) { return false; } + if (!vtransform.is_profitable()) { return false; } + + vtransform.apply(); + return true; +} + +bool VTransform::is_profitable() const { + assert(_graph.is_scheduled(), "must already be scheduled"); if (AutoVectorizationOverrideProfitability == 0) { #ifndef PRODUCT - if (is_trace_superword_any()) { + if (_trace._info) { tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0)."); } #endif return false; } + if (AutoVectorizationOverrideProfitability == 2) { +#ifndef PRODUCT + if (_trace._info) { + tty->print_cr("\nForced vectorization, ignoring profitability (AutoVectorizationOverrideProfitability=2)."); + } +#endif + return true; + } + + if (has_store_to_load_forwarding_failure()) { return false; } + // TODO: check cost - vtransform.apply(); return true; } diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index bfd124a6ed772..b0902cab5ca9f 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -256,6 +256,7 @@ class VTransform : public StackObj { void optimize() { return _graph.optimize(*this); } bool schedule() { return _graph.schedule(); } + bool is_profitable() const; bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); } void apply(); From 30d916f842f651fac5466116979da608b7309932 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 20 Oct 2025 16:32:16 +0200 Subject: [PATCH 05/39] code transfer wip --- src/hotspot/share/opto/superword.cpp | 19 ++++- src/hotspot/share/opto/vectorization.cpp | 87 
+++++++++++++++++++++++ src/hotspot/share/opto/vectorization.hpp | 11 +++ src/hotspot/share/opto/vtransform.cpp | 89 ++++++++++++++++++++++++ src/hotspot/share/opto/vtransform.hpp | 2 + 5 files changed, 205 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 50e4ebece3072..f8d247bf6dd32 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1899,6 +1899,7 @@ bool SuperWord::do_vtransform() const { return true; } +// TODO: move to other file bool VTransform::is_profitable() const { assert(_graph.is_scheduled(), "must already be scheduled"); @@ -1920,11 +1921,23 @@ bool VTransform::is_profitable() const { return true; } + // Note: currently we only do throughput-based cost-modeling. In the future, we could + // also implement latency-based cost-modeling and take store-to-load-forwarding + // failures into account as the latency between the load and store. This would + // allow a more precise tradeoff between the forwarding failure penalty versus + // the vectorization gains. if (has_store_to_load_forwarding_failure()) { return false; } - // TODO: check cost - - return true; + // Cost-model + float scalar_cost = _vloop_analyzer.cost(); + float vector_cost = cost(); +#ifndef PRODUCT + if (_trace._info) { + tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f", + scalar_cost, vector_cost); + } +#endif + return vector_cost < scalar_cost; } // Apply the vectorization, i.e. we irreversibly edit the C2 graph. At this point, all diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 5c4e15fdbb916..84e81b214c9b7 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -541,6 +541,93 @@ void VLoopDependencyGraph::PredsIterator::next() { } } +//bool VLoopAnalyzer::has_zero_cost(Node* n) const { +// // Outside body? 
+// if (!_vloop.in_bb(n)) { return true; } +// +// // Internal nodes of pointer expressions are most likely folded into +// // the load / store and have no additional cost. +// if (vpointers().is_in_pointer_expression(n)) { return true; } +// +// if (n->is_AddP() || // Pointer expression +// n->is_CFG() || // CFG +// n->is_Phi() || // CFG +// n->is_Cmp() || // CFG +// n->is_Bool()) { // CFG +// return true; +// } +// +// // All other nodes have a non-zero cost. +// return false; +//} + +// Compute the cost over all operations in the (scalar) loop. +float VLoopAnalyzer::cost() const { + return 0; +} + +// TODO: impl +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr("\nVLoopAnalyzer::cost:"); +// } +//#endif +// +// float sum = 0; +// for (int j = 0; j < body().body().length(); j++) { +// Node* n = body().body().at(j); +// if (!has_zero_cost(n)) { +// float c = cost_for_scalar(n->Opcode()); +// sum += c; +//#ifndef PRODUCT +// if (_vloop.is_trace_cost_verbose()) { +// tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name()); +// } +//#endif +// } +// } +// +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr(" total_cost = %.2f", sum); +// } +//#endif +// return sum; +//} +// +//float VLoopAnalyzer::cost_for_scalar(int opcode) const { +// float c = Matcher::cost_for_scalar(opcode); +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); +// } +//#endif +// return c; +//} +// +//float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { +// float c = Matcher::cost_for_vector(opcode, vlen, bt); +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", +// c, NodeClassNames[opcode], vlen, type2name(bt)); +// } +//#endif +// return c; +//} +// +//float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { +// float c = 
Matcher::cost_for_vector_reduction(opcode, vlen, bt, requires_strict_order); +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", +// c, NodeClassNames[opcode], vlen, type2name(bt), +// requires_strict_order ? "true" : "false"); +// } +//#endif +// return c; +//} + // Computing aliasing runtime check using init and last of main-loop // ----------------------------------------------------------------- // diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index b1be52d531a51..aa72980db83bb 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -810,6 +810,17 @@ class VLoopAnalyzer : StackObj { const VLoopVPointers& vpointers() const { return _vpointers; } const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; } + // Compute the cost of the (scalar) body. + float cost() const; + // TODO: impl + // bool has_zero_cost(Node* n) const; + + // TODO: impl + // // Cost-modeling with tracing. + // float cost_for_scalar(int opcode) const; + // float cost_for_vector(int opcode, int vlen, BasicType bt) const; + // float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; + private: bool setup_submodules(); VStatus setup_submodules_helper(); diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 46e8f43cb657d..eb982e3ccbe9f 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -186,6 +186,95 @@ int VTransformGraph::count_alive_vtnodes() const { return count; } +// Find all nodes that in the loop, in a 2-phase process: +// - First, find all nodes that are not before the loop: +// - loop-phis +// - loads and stores that are in the loop +// - and all their transitive uses. 
+// - Second, we find all nodes that are not after the loop: +// - backedges +// - loads and stores that are in the loop +// - and all their transitive uses. +//void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { +// assert(is_scheduled(), "must already be scheduled"); +// +// // Phase 1: find all nodes that are not before the loop. +// VectorSet is_not_before_loop; +// for (int i = 0; i < _schedule.length(); i++) { +// VTransformNode* vtn = _schedule.at(i); +// // Is vtn a loop-phi? +// if (vtn->isa_LoopPhi() != nullptr || +// vtn->is_load_or_store_in_loop()) { +// is_not_before_loop.set(vtn->_idx); +// continue; +// } +// // Or one of its transitive uses? +// for (uint j = 0; j < vtn->req(); j++) { +// VTransformNode* def = vtn->in(j); +// if (def != nullptr && is_not_before_loop.test(def->_idx)) { +// is_not_before_loop.set(vtn->_idx); +// break; +// } +// } +// } +// +// // Phase 2: find all nodes that are not after the loop. +// for (int i = _schedule.length()-1; i >= 0; i--) { +// VTransformNode* vtn = _schedule.at(i); +// if (!is_not_before_loop.test(vtn->_idx)) { continue; } +// // Is load or store? +// if (vtn->is_load_or_store_in_loop()) { +// in_loop.set(vtn->_idx); +// continue; +// } +// for (int i = 0; i < vtn->outs(); i++) { +// VTransformNode* use = vtn->out(i); +// // Or is vtn a backedge or one of its transitive defs? 
+// if (in_loop.test(use->_idx) || +// use->isa_LoopPhi() != nullptr) { +// in_loop.set(vtn->_idx); +// break; +// } +// } +// } +//} + +float VTransformGraph::cost() const { + assert(is_scheduled(), "must already be scheduled"); + return 1; +} +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr("\nVTransformGraph::cost:"); +// } +//#endif +// +// ResourceMark rm; +// VectorSet in_loop; +// mark_vtnodes_in_loop(in_loop); +// +// float sum = 0; +// for (int i = 0; i < _schedule.length(); i++) { +// VTransformNode* vtn = _schedule.at(i); +// if (!in_loop.test(vtn->_idx)) { continue; } +// float c = vtn->cost(_vloop_analyzer); +// sum += c; +//#ifndef PRODUCT +// if (c != 0 && _vloop.is_trace_cost_verbose()) { +// tty->print(" -> cost = %.2f for ", c); +// vtn->print(); +// } +//#endif +// } +// +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr(" total_cost = %.2f", sum); +// } +//#endif +// return sum; +//} + #ifndef PRODUCT void VTransformGraph::trace_schedule_cycle(const GrowableArray& stack, const VectorSet& pre_visited, diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index b0902cab5ca9f..60efaa544b206 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -194,6 +194,7 @@ class VTransformGraph : public StackObj { void optimize(VTransform& vtransform); bool schedule(); bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const; + float cost() const; void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const; private: @@ -257,6 +258,7 @@ class VTransform : public StackObj { void optimize() { return _graph.optimize(*this); } bool schedule() { return _graph.schedule(); } bool is_profitable() const; + float cost() const { return _graph.cost(); } bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); } void apply(); 
From da3b4b3134a4e204237660465ae8b6800577eda7 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 20 Oct 2025 16:46:29 +0200 Subject: [PATCH 06/39] wip apply code --- .../share/opto/traceAutoVectorizationTag.hpp | 2 + src/hotspot/share/opto/vectorization.hpp | 8 ++ src/hotspot/share/opto/vtransform.cpp | 116 +++++++++--------- src/hotspot/share/opto/vtransform.hpp | 1 + 4 files changed, 70 insertions(+), 57 deletions(-) diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index d996173aeb43b..aac3d09f44995 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -47,6 +47,8 @@ flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \ flags(VTRANSFORM, "Trace VTransform Graph") \ flags(OPTIMIZATION, "Trace VTransform::optimize") \ + flags(COST, "Trace cost of VLoop (scalar) and VTransform (vector)") \ + flags(COST_VERBOSE, "Trace like COST, but more verbose") \ flags(ALIGN_VECTOR, "Trace AlignVector") \ flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \ flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \ diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index aa72980db83bb..25f5b99b904f5 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -209,6 +209,14 @@ class VLoop : public StackObj { return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION); } + bool is_trace_cost() const { + return _vtrace.is_trace(TraceAutoVectorizationTag::COST); + } + + bool is_trace_cost_verbose() const { + return _vtrace.is_trace(TraceAutoVectorizationTag::COST_VERBOSE); + } + bool is_trace_speculative_runtime_checks() const { return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS); } diff --git a/src/hotspot/share/opto/vtransform.cpp 
b/src/hotspot/share/opto/vtransform.cpp index eb982e3ccbe9f..46db23c5e52e9 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -195,65 +195,67 @@ int VTransformGraph::count_alive_vtnodes() const { // - backedges // - loads and stores that are in the loop // - and all their transitive uses. -//void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { -// assert(is_scheduled(), "must already be scheduled"); // -// // Phase 1: find all nodes that are not before the loop. -// VectorSet is_not_before_loop; -// for (int i = 0; i < _schedule.length(); i++) { -// VTransformNode* vtn = _schedule.at(i); -// // Is vtn a loop-phi? -// if (vtn->isa_LoopPhi() != nullptr || -// vtn->is_load_or_store_in_loop()) { -// is_not_before_loop.set(vtn->_idx); -// continue; -// } -// // Or one of its transitive uses? -// for (uint j = 0; j < vtn->req(); j++) { -// VTransformNode* def = vtn->in(j); -// if (def != nullptr && is_not_before_loop.test(def->_idx)) { -// is_not_before_loop.set(vtn->_idx); -// break; -// } -// } -// } -// -// // Phase 2: find all nodes that are not after the loop. -// for (int i = _schedule.length()-1; i >= 0; i--) { -// VTransformNode* vtn = _schedule.at(i); -// if (!is_not_before_loop.test(vtn->_idx)) { continue; } -// // Is load or store? -// if (vtn->is_load_or_store_in_loop()) { -// in_loop.set(vtn->_idx); -// continue; -// } -// for (int i = 0; i < vtn->outs(); i++) { -// VTransformNode* use = vtn->out(i); -// // Or is vtn a backedge or one of its transitive defs? -// if (in_loop.test(use->_idx) || -// use->isa_LoopPhi() != nullptr) { -// in_loop.set(vtn->_idx); -// break; -// } -// } -// } -//} +// in_loop: vtn->_idx -> bool +void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { + assert(is_scheduled(), "must already be scheduled"); + + // Phase 1: find all nodes that are not before the loop. 
+ VectorSet is_not_before_loop; + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + // Is vtn a loop-phi? + if (vtn->isa_LoopPhi() != nullptr || + // TODO: what about VTransformCountedLoopNode? + vtn->is_load_or_store_in_loop()) { + is_not_before_loop.set(vtn->_idx); + continue; + } + // Or one of its transitive uses? + for (uint j = 0; j < vtn->req(); j++) { + VTransformNode* def = vtn->in_req(j); + if (def != nullptr && is_not_before_loop.test(def->_idx)) { + is_not_before_loop.set(vtn->_idx); + break; + } + } + } + + // Phase 2: find all nodes that are not after the loop. + for (int i = _schedule.length()-1; i >= 0; i--) { + VTransformNode* vtn = _schedule.at(i); + if (!is_not_before_loop.test(vtn->_idx)) { continue; } + // Is load or store? + if (vtn->is_load_or_store_in_loop()) { + in_loop.set(vtn->_idx); + continue; + } + for (uint i = 0; i < vtn->out_strong_edges(); i++) { + VTransformNode* use = vtn->out_strong_edge(i); + // Or is vtn a backedge or one of its transitive defs? + if (in_loop.test(use->_idx) || + use->isa_LoopPhi() != nullptr) { + in_loop.set(vtn->_idx); + break; + } + } + // TODO: what about CFG nodes? 
+ } +} float VTransformGraph::cost() const { assert(is_scheduled(), "must already be scheduled"); - return 1; -} -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr("\nVTransformGraph::cost:"); -// } -//#endif -// -// ResourceMark rm; -// VectorSet in_loop; -// mark_vtnodes_in_loop(in_loop); -// -// float sum = 0; +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr("\nVTransformGraph::cost:"); + } +#endif + + ResourceMark rm; + VectorSet in_loop; // vtn->_idx -> bool + mark_vtnodes_in_loop(in_loop); + + float sum = 0; // for (int i = 0; i < _schedule.length(); i++) { // VTransformNode* vtn = _schedule.at(i); // if (!in_loop.test(vtn->_idx)) { continue; } @@ -272,8 +274,8 @@ float VTransformGraph::cost() const { // tty->print_cr(" total_cost = %.2f", sum); // } //#endif -// return sum; -//} + return sum; +} #ifndef PRODUCT void VTransformGraph::trace_schedule_cycle(const GrowableArray& stack, diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 60efaa544b206..4c6d4fd70f9ad 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -205,6 +205,7 @@ class VTransformGraph : public StackObj { void collect_nodes_without_strong_in_edges(GrowableArray& stack) const; int count_alive_vtnodes() const; + void mark_vtnodes_in_loop(VectorSet& in_loop) const; #ifndef PRODUCT void print_vtnodes() const; From b7b5ac00a7f08769a686c249034a3478653dada3 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 20 Oct 2025 17:25:20 +0200 Subject: [PATCH 07/39] wip impl cost for vtn --- src/hotspot/share/opto/vectorization.cpp | 106 ++++++++++++----------- src/hotspot/share/opto/vectorization.hpp | 14 ++- src/hotspot/share/opto/vtransform.cpp | 103 ++++++++++++++++++---- src/hotspot/share/opto/vtransform.hpp | 19 ++++ 4 files changed, 164 insertions(+), 78 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 
84e81b214c9b7..0f59f746f21d1 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -541,25 +541,27 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -//bool VLoopAnalyzer::has_zero_cost(Node* n) const { -// // Outside body? -// if (!_vloop.in_bb(n)) { return true; } -// -// // Internal nodes of pointer expressions are most likely folded into -// // the load / store and have no additional cost. -// if (vpointers().is_in_pointer_expression(n)) { return true; } -// -// if (n->is_AddP() || // Pointer expression -// n->is_CFG() || // CFG -// n->is_Phi() || // CFG -// n->is_Cmp() || // CFG -// n->is_Bool()) { // CFG -// return true; -// } -// -// // All other nodes have a non-zero cost. -// return false; -//} +bool VLoopAnalyzer::has_zero_cost(Node* n) const { + // Outside body? + if (!_vloop.in_bb(n)) { return true; } + // TODO: can we widen this to the loop, not just bb? + + // Internal nodes of pointer expressions are most likely folded into + // the load / store and have no additional cost. + // TODO: implement + // if (vpointers().is_in_pointer_expression(n)) { return true; } + + if (n->is_AddP() || // Pointer expression + n->is_CFG() || // CFG + n->is_Phi() || // CFG + n->is_Cmp() || // CFG + n->is_Bool()) { // CFG + return true; + } + + // All other nodes have a non-zero cost. + return false; +} // Compute the cost over all operations in the (scalar) loop. 
float VLoopAnalyzer::cost() const { @@ -594,39 +596,39 @@ float VLoopAnalyzer::cost() const { //#endif // return sum; //} -// -//float VLoopAnalyzer::cost_for_scalar(int opcode) const { -// float c = Matcher::cost_for_scalar(opcode); -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); -// } -//#endif -// return c; -//} -// -//float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { -// float c = Matcher::cost_for_vector(opcode, vlen, bt); -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", -// c, NodeClassNames[opcode], vlen, type2name(bt)); -// } -//#endif -// return c; -//} -// -//float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { -// float c = Matcher::cost_for_vector_reduction(opcode, vlen, bt, requires_strict_order); -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", -// c, NodeClassNames[opcode], vlen, type2name(bt), -// requires_strict_order ? 
"true" : "false"); -// } -//#endif -// return c; -//} + +float VLoopAnalyzer::cost_for_scalar(int opcode) const { + float c = Matcher::cost_for_scalar(opcode); +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); + } +#endif + return c; +} + +float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { + float c = Matcher::cost_for_vector(opcode, vlen, bt); +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", + c, NodeClassNames[opcode], vlen, type2name(bt)); + } +#endif + return c; +} + +float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { + float c = Matcher::cost_for_vector_reduction(opcode, vlen, bt, requires_strict_order); +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", + c, NodeClassNames[opcode], vlen, type2name(bt), + requires_strict_order ? "true" : "false"); + } +#endif + return c; +} // Computing aliasing runtime check using init and last of main-loop // ----------------------------------------------------------------- diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 25f5b99b904f5..89f5778a45335 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -820,14 +820,12 @@ class VLoopAnalyzer : StackObj { // Compute the cost of the (scalar) body. float cost() const; - // TODO: impl - // bool has_zero_cost(Node* n) const; - - // TODO: impl - // // Cost-modeling with tracing. - // float cost_for_scalar(int opcode) const; - // float cost_for_vector(int opcode, int vlen, BasicType bt) const; - // float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; + bool has_zero_cost(Node* n) const; + + // Cost-modeling with tracing. 
+ float cost_for_scalar(int opcode) const; + float cost_for_vector(int opcode, int vlen, BasicType bt) const; + float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; private: bool setup_submodules(); diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 46db23c5e52e9..542bc194511b5 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -256,24 +256,24 @@ float VTransformGraph::cost() const { mark_vtnodes_in_loop(in_loop); float sum = 0; -// for (int i = 0; i < _schedule.length(); i++) { -// VTransformNode* vtn = _schedule.at(i); -// if (!in_loop.test(vtn->_idx)) { continue; } -// float c = vtn->cost(_vloop_analyzer); -// sum += c; -//#ifndef PRODUCT -// if (c != 0 && _vloop.is_trace_cost_verbose()) { -// tty->print(" -> cost = %.2f for ", c); -// vtn->print(); -// } -//#endif -// } -// -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr(" total_cost = %.2f", sum); -// } -//#endif + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + if (!in_loop.test(vtn->_idx)) { continue; } + float c = vtn->cost(_vloop_analyzer); + sum += c; +#ifndef PRODUCT + if (c != 0 && _vloop.is_trace_cost_verbose()) { + tty->print(" -> cost = %.2f for ", c); + vtn->print(); + } +#endif + } + +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" total_cost = %.2f", sum); + } +#endif return sum; } @@ -922,6 +922,10 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app } } +float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_scalar(_node->Opcode()); +} + VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const { apply_vtn_inputs_to_node(_node, apply_state); // The memory state has to be applied separately: the vtn does not hold it. This allows reordering. 
@@ -934,6 +938,10 @@ VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& app return VTransformApplyResult::make_scalar(_node); } +float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_scalar(_node->Opcode()); +} + VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const { apply_vtn_inputs_to_node(_node, apply_state); return VTransformApplyResult::make_scalar(_node); @@ -986,6 +994,10 @@ VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_sta return VTransformApplyResult::make_scalar(_node); } +float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector(Op_Replicate, _vlen, _element_type); +} + VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const { Node* val = apply_state.transformed_node(in_req(1)); VectorNode* vn = VectorNode::scalar2vector(val, _vlen, _element_type); @@ -993,6 +1005,10 @@ VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply return VTransformApplyResult::make_vector(vn); } +float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_scalar(Op_ConvI2L); +} + VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const { Node* val = apply_state.transformed_node(in_req(1)); Node* n = new ConvI2LNode(val); @@ -1000,6 +1016,12 @@ VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_s return VTransformApplyResult::make_scalar(n); } +float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode); + return vloop_analyzer.cost_for_scalar(Op_AndI) + + vloop_analyzer.cost_for_vector(shift_count_opc, _vlen, _element_bt); +} + VTransformApplyResult 
VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const { PhaseIdealLoop* phase = apply_state.phase(); Node* shift_count_in = apply_state.transformed_node(in_req(1)); @@ -1015,6 +1037,9 @@ VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& appl return VTransformApplyResult::make_vector(vn); } +float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector(Op_PopulateIndex, _vlen, _element_bt); +} VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const { PhaseIdealLoop* phase = apply_state.phase(); @@ -1027,6 +1052,10 @@ VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& a return VTransformApplyResult::make_vector(vn); } +float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()); +} + VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const { assert(2 <= req() && req() <= 4, "Must have 1-3 inputs"); const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length()); @@ -1045,6 +1074,13 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat return VTransformApplyResult::make_vector(vn); } +float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + // // TODO: implement, consider cast etc. 
+ // return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()) + + // vloop_analyzer.cost_for_vector(Op_VectorCastL2X, vector_length(), XXX); + return 2; +} + VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const { uint vlen = vector_length(); int sopc = scalar_opcode(); @@ -1060,6 +1096,12 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply( return VTransformApplyResult::make_vector(vn); } +float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + // TODO: implement + //return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()); + return 1; +} + VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const { const TypeVect* dst_vt = TypeVect::make(element_basic_type(), vector_length()); const TypeVect* src_vt = TypeVect::make(_src_bt, vector_length()); @@ -1072,6 +1114,11 @@ VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyStat return VTransformApplyResult::make_vector(vn); } +float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + assert(scalar_opcode() == Op_Bool, ""); + return vloop_analyzer.cost_for_vector(Op_VectorMaskCmp, vector_length(), element_basic_type()); +} + VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const { const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length()); assert(scalar_opcode() == Op_Bool, ""); @@ -1327,6 +1374,14 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou return true; // success } +float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + int vopc = vector_reduction_opcode(); + bool requires_strict_order = 
ReductionNode::auto_vectorization_requires_strict_order(vopc); + return vloop_analyzer.cost_for_vector_reduction(vopc, vlen, bt, requires_strict_order); +} + VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const { Node* init = apply_state.transformed_node(in_req(1)); Node* vec = apply_state.transformed_node(in_req(2)); @@ -1336,6 +1391,12 @@ VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& return VTransformApplyResult::make_vector(vn, vn->vect_type()); } +float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + return vloop_analyzer.cost_for_vector(Op_LoadVector, vlen, bt); +} + VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const { int sopc = scalar_opcode(); uint vlen = vector_length(); @@ -1365,6 +1426,12 @@ VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& appl return VTransformApplyResult::make_vector(vn, vn->vect_type()); } +float VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + return vloop_analyzer.cost_for_vector(Op_StoreVector, vlen, bt); +} + VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const { int sopc = scalar_opcode(); uint vlen = vector_length(); diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 4c6d4fd70f9ad..a887300806ce9 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -557,6 +557,8 @@ class VTransformNode : public ArenaObj { virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const = 0; + virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 
0; virtual void apply_backedge(VTransformApplyState& apply_state) const {}; void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const; @@ -587,6 +589,7 @@ class VTransformMemopScalarNode : public VTransformNode { virtual bool is_load_or_store_in_loop() const override { return true; } virtual const VPointer& vpointer() const override { return _vpointer; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "MemopScalar"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -603,6 +606,7 @@ class VTransformDataScalarNode : public VTransformNode { assert(!_node->is_Mem() && !_node->is_Phi() && !_node->is_CFG(), "must be data node: %s", _node->Name()); } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "DataScalar"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -620,6 +624,7 @@ class VTransformLoopPhiNode : public VTransformNode { } virtual VTransformLoopPhiNode* isa_LoopPhi() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; virtual void apply_backedge(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoopPhi"; };) @@ -637,6 +642,7 @@ class VTransformCFGNode : public VTransformNode { assert(_node->is_CFG(), "must be CFG node: %s", _node->Name()); } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const 
override { return "CFG"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -663,6 +669,7 @@ class VTransformOuterNode : public VTransformNode { VTransformNode(vtransform, n->req()), _node(n) {} virtual VTransformOuterNode* isa_Outer() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { ShouldNotReachHere(); } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "Outer"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -676,6 +683,7 @@ class VTransformReplicateNode : public VTransformNode { public: VTransformReplicateNode(VTransform& vtransform, int vlen, BasicType element_type) : VTransformNode(vtransform, 2), _vlen(vlen), _element_type(element_type) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "Replicate"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -685,6 +693,7 @@ class VTransformReplicateNode : public VTransformNode { class VTransformConvI2LNode : public VTransformNode { public: VTransformConvI2LNode(VTransform& vtransform) : VTransformNode(vtransform, 2) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ConvI2L"; };) }; @@ -699,6 +708,7 @@ class VTransformShiftCountNode : public VTransformNode { public: VTransformShiftCountNode(VTransform& vtransform, int vlen, BasicType element_bt, juint mask, int shift_opcode) : VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt), _mask(mask), _shift_opcode(shift_opcode) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual 
VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ShiftCount"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -712,6 +722,7 @@ class VTransformPopulateIndexNode : public VTransformNode { public: VTransformPopulateIndexNode(VTransform& vtransform, int vlen, const BasicType element_bt) : VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "PopulateIndex"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -777,6 +788,7 @@ class VTransformElementWiseVectorNode : public VTransformVectorNode { VTransformElementWiseVectorNode(VTransform& vtransform, uint req, const VTransformVectorNodeProperties properties, const int vector_opcode) : VTransformVectorNode(vtransform, req, properties), _vector_opcode(vector_opcode) {} virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -789,6 +801,7 @@ class VTransformElementWiseLongOpWithCastToIntVectorNode : public VTransformVect public: VTransformElementWiseLongOpWithCastToIntVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) : VTransformVectorNode(vtransform, 2, properties) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return 
"ElementWiseLongOpWithCastToIntVector"; };) }; @@ -799,6 +812,7 @@ class VTransformReinterpretVectorNode : public VTransformVectorNode { public: VTransformReinterpretVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties, const BasicType src_bt) : VTransformVectorNode(vtransform, 2, properties), _src_bt(src_bt) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ReinterpretVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -819,6 +833,7 @@ class VTransformCmpVectorNode : public VTransformVectorNode { VTransformCmpVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) : VTransformVectorNode(vtransform, 3, properties) {} virtual VTransformCmpVectorNode* isa_CmpVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override { return VTransformApplyResult::make_empty(); } NOT_PRODUCT(virtual const char* name() const override { return "CmpVector"; };) }; @@ -831,6 +846,7 @@ class VTransformBoolVectorNode : public VTransformVectorNode { VTransformVectorNode(vtransform, 2, properties), _test(test) {} VTransformBoolTest test() const { return _test; } virtual VTransformBoolVectorNode* isa_BoolVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "BoolVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -843,6 +859,7 @@ class VTransformReductionVectorNode : public VTransformVectorNode { VTransformVectorNode(vtransform, 3, properties) {} virtual 
VTransformReductionVectorNode* isa_ReductionVector() override { return this; } virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override; + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };) @@ -885,6 +902,7 @@ class VTransformLoadVectorNode : public VTransformMemVectorNode { LoadNode::ControlDependency control_dependency() const; virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; } virtual bool is_load_in_loop() const override { return true; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };) }; @@ -896,6 +914,7 @@ class VTransformStoreVectorNode : public VTransformMemVectorNode { VTransformMemVectorNode(vtransform, 4, properties, vpointer, adr_type) {} virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; } virtual bool is_load_in_loop() const override { return false; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };) }; From 49f9242c09c57ee133d8d76762a2f653cb9ad58b Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 22 Oct 2025 08:56:35 +0200 Subject: [PATCH 08/39] impl more cost methods --- src/hotspot/share/opto/vtransform.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 542bc194511b5..cec4cff7fb49b 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp 
@@ -1075,10 +1075,9 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat } float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - // // TODO: implement, consider cast etc. - // return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()) + - // vloop_analyzer.cost_for_vector(Op_VectorCastL2X, vector_length(), XXX); - return 2; + int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type()); + return vloop_analyzer.cost_for_vector(vopc, vector_length(), element_basic_type()) + + vloop_analyzer.cost_for_vector(Op_VectorCastL2X, vector_length(), T_INT); } VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1097,9 +1096,7 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply( } float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - // TODO: implement - //return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()); - return 1; + return vloop_analyzer.cost_for_vector(Op_VectorReinterpret, vector_length(), element_basic_type()); } VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const { From b32afed895aa72d39c07fe149bb8a7e51cb13c76 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 22 Oct 2025 09:03:15 +0200 Subject: [PATCH 09/39] fix comment --- src/hotspot/share/opto/superword.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index f8d247bf6dd32..e359fa87e10c0 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1899,7 +1899,8 @@ bool SuperWord::do_vtransform() const { return true; } -// TODO: move to other file +// Check Cost-Model, and other heuristics. 
+// Can be overridden with AutoVectorizationOverrideProfitability. bool VTransform::is_profitable() const { assert(_graph.is_scheduled(), "must already be scheduled"); From a77059f8e0f3661da7423e225ba834cab56d53e4 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 22 Oct 2025 09:07:18 +0200 Subject: [PATCH 10/39] scalar cost --- src/hotspot/share/opto/vectorization.cpp | 57 +++++++++++------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 0f59f746f21d1..31a28cca305c9 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -565,37 +565,34 @@ bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Compute the cost over all operations in the (scalar) loop. float VLoopAnalyzer::cost() const { - return 0; -} +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr("\nVLoopAnalyzer::cost:"); + } +#endif -// TODO: impl -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr("\nVLoopAnalyzer::cost:"); -// } -//#endif -// -// float sum = 0; -// for (int j = 0; j < body().body().length(); j++) { -// Node* n = body().body().at(j); -// if (!has_zero_cost(n)) { -// float c = cost_for_scalar(n->Opcode()); -// sum += c; -//#ifndef PRODUCT -// if (_vloop.is_trace_cost_verbose()) { -// tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name()); -// } -//#endif -// } -// } -// -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr(" total_cost = %.2f", sum); -// } -//#endif -// return sum; -//} + float sum = 0; + // TODO: does this go over the whole loop, or just the basic block? 
+ for (int j = 0; j < body().body().length(); j++) { + Node* n = body().body().at(j); + if (!has_zero_cost(n)) { + float c = cost_for_scalar(n->Opcode()); + sum += c; +#ifndef PRODUCT + if (_vloop.is_trace_cost_verbose()) { + tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name()); + } +#endif + } + } + +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" total_cost = %.2f", sum); + } +#endif + return sum; +} float VLoopAnalyzer::cost_for_scalar(int opcode) const { float c = Matcher::cost_for_scalar(opcode); From a8f11c496f02d778aa0455d8cba8afcbaf5c29ee Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 08:13:00 +0200 Subject: [PATCH 11/39] ignore pointer expression nodes --- .../share/opto/traceAutoVectorizationTag.hpp | 2 +- src/hotspot/share/opto/vectorization.cpp | 5 ++- src/hotspot/share/opto/vectorization.hpp | 36 ++++++++++++++++++- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index aac3d09f44995..4f67aff9b0706 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -38,7 +38,7 @@ flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ flags(BODY, "Trace VLoopBody") \ flags(TYPES, "Trace VLoopTypes") \ - flags(POINTERS, "Trace VLoopPointers") \ + flags(POINTERS, "Trace VLoopVPointers") \ flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \ flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \ flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \ diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 31a28cca305c9..b2cc4200015bb 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -287,7 +287,7 @@ void VLoopVPointers::compute_and_cache_vpointers() { int pointers_idx = 0; 
_body.for_each_mem([&] (MemNode* const mem, int bb_idx) { // Placement new: construct directly into the array. - ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop); + ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, _pointer_expression_nodes); _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); pointers_idx++; }); @@ -548,8 +548,7 @@ bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Internal nodes of pointer expressions are most likely folded into // the load / store and have no additional cost. - // TODO: implement - // if (vpointers().is_in_pointer_expression(n)) { return true; } + if (vpointers().is_in_pointer_expression(n)) { return true; } if (n->is_AddP() || // Pointer expression n->is_CFG() || // CFG diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 89f5778a45335..419c29d6544a5 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -592,6 +592,32 @@ class VLoopTypes : public StackObj { const Type* container_type(Node* n) const; }; +// Mark all nodes from the loop that are part of any VPointer expression. +class PointerExpressionNodes : public MemPointerParserCallback { +private: + const VLoop& _vloop; + const VLoopBody& _body; + VectorSet _in_pointer_expression; + +public: + PointerExpressionNodes(Arena* arena, + const VLoop& vloop, + const VLoopBody& body) : + _vloop(vloop), + _body(body), + _in_pointer_expression(arena) {} + + virtual void callback(Node* n) override { + if (!_vloop.in_bb(n)) { return; } + _in_pointer_expression.set(_body.bb_idx(n)); + } + + bool contains(const Node* n) const { + if (!_vloop.in_bb(n)) { return false; } + return _in_pointer_expression.test(_body.bb_idx(n)); + } +}; + // Submodule of VLoopAnalyzer. // We compute and cache the VPointer for every load and store. class VLoopVPointers : public StackObj { @@ -607,6 +633,9 @@ class VLoopVPointers : public StackObj { // Map bb_idx -> index in _vpointers. 
-1 if not mapped. GrowableArray _bb_idx_to_vpointer; + // Mark all nodes that are part of any pointer expression. + PointerExpressionNodes _pointer_expression_nodes; + public: VLoopVPointers(Arena* arena, const VLoop& vloop, @@ -618,13 +647,18 @@ _bb_idx_to_vpointer(arena, vloop.estimated_body_length(), vloop.estimated_body_length(), - -1) {} + -1), + _pointer_expression_nodes(arena, _vloop, _body) {} NONCOPYABLE(VLoopVPointers); void compute_vpointers(); const VPointer& vpointer(const MemNode* mem) const; NOT_PRODUCT( void print() const; ) + bool is_in_pointer_expression(const Node* n) const { + return _pointer_expression_nodes.contains(n); + } + private: void count_vpointers(); void allocate_vpointers_array(); From 693dcf1aca9bad6df66bf5a1fae61b0d5896f78e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 09:37:45 +0200 Subject: [PATCH 12/39] zero cost for data scalar nodes that have zero cost --- src/hotspot/share/opto/vectorization.cpp | 1 + src/hotspot/share/opto/vtransform.cpp | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index b2cc4200015bb..79320948d2110 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -541,6 +541,7 @@ void VLoopDependencyGraph::PredsIterator::next() { } } +// TODO: Description bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Outside body?
if (!_vloop.in_bb(n)) { return true; } diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index cec4cff7fb49b..0fca37ca39ba3 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -923,6 +923,8 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app } float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + // This is an identity transform, but loads and stores must be counted. + assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted"); return vloop_analyzer.cost_for_scalar(_node->Opcode()); } @@ -939,7 +941,13 @@ VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& app } float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return vloop_analyzer.cost_for_scalar(_node->Opcode()); + // Since this is an identity transform, we may have nodes that also + // VLoopAnalyzer::cost does not count for the scalar loop. 
+ if (vloop_analyzer.has_zero_cost(_node)) { + return 0; + } else { + return vloop_analyzer.cost_for_scalar(_node->Opcode()); + } } VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const { From 2a9aba2bb5d4aeb8a108d4f6c84da84d698fa029 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 10:00:48 +0200 Subject: [PATCH 13/39] improve documentation and fix test --- src/hotspot/share/opto/vectorization.cpp | 8 +++++++- ...tAutoVectorizationOverrideProfitability.java | 17 +++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 79320948d2110..3527e502afb7a 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -541,7 +541,8 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -// TODO: Description +// Cost-model heuristic for nodes that do not contribute to computational +// cost inside the loop. bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Outside body? if (!_vloop.in_bb(n)) { return true; } @@ -551,6 +552,11 @@ bool VLoopAnalyzer::has_zero_cost(Node* n) const { // the load / store and have no additional cost. if (vpointers().is_in_pointer_expression(n)) { return true; } + // Not all AddP nodes can be detected in VPointer parsing, so + we filter them out here. + We don't want to explicitly model the cost of control flow, + since we have the same CFG structure before and after + vectorization: A loop head, a loop exit, with a backedge.
if (n->is_AddP() || // Pointer expression n->is_CFG() || // CFG n->is_Phi() || // CFG diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java index 10ad19d03a74d..89b46871cb56a 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java @@ -115,17 +115,18 @@ public static void checkSimpleFloatCopy() { @Test @Warmup(10) @IR(applyIfCPUFeatureOr = {"avx", "true"}, - applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}, counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"}) @IR(applyIfCPUFeatureOr = {"avx", "true"}, - applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}, counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"}) - // Current heuristics say that this simple int reduction is not profitable. - // But it would actually be profitable, since we are able to move the - // reduction out of the loop (we can reorder the reduction). When moving - // the reduction out of the loop, we instead accumulate with a simple - // ADD_VI inside the loop. - // See: JDK-8307516 JDK-8345044 + // We are able to vectorize the reduction. But on its own, that would + // not reduce the cost sufficiently in all cases, because vectorized + // reduction nodes are expensive. But since integer addition is associative + // we can move the reduction vector out of the loop. Instead, we accumulate + // with a simple ADD_VI inside the loop, which is very cheap. After the + // loop, we only need to use the vectorized reduction once, to collapse + // the partial sums contained in the lanes. 
private static int simpleIntReduction() { int sum = 0; for (int i = 0; i < aI.length; i++) { From baa41e4b8f85173795db7b47ae062a8e1731eccd Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 10:51:48 +0200 Subject: [PATCH 14/39] fix another test --- .../jtreg/compiler/c2/cr7200264/TestIntVect.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java b/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java index 457e33667b2d1..76c33ec1b0772 100644 --- a/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java +++ b/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java @@ -410,12 +410,12 @@ public void run() { } - // Not vectorized: simple addition not profitable, see JDK-8307516. NOTE: - // This check does not document the _desired_ behavior of the system but - // the current behavior (no vectorization) @Test - @IR(counts = { IRNode.LOAD_VECTOR_I, "= 0", - IRNode.STORE_VECTOR, "= 0" }) + @IR(counts = { IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0" }) + // The reduction is moved outside the loop, and we use a + // element-wise accumulator inside the loop. int test_sum(int[] a1) { int sum = 0; for (int i = 0; i < a1.length; i+=1) { From 24a6c338724c074171595f283a12eccad4d76eba Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 10:59:52 +0200 Subject: [PATCH 15/39] resolve some todos --- src/hotspot/share/opto/vtransform.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 0fca37ca39ba3..954a915cbaea4 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -206,7 +206,6 @@ void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { VTransformNode* vtn = _schedule.at(i); // Is vtn a loop-phi? 
if (vtn->isa_LoopPhi() != nullptr || - // TODO: what about VTransformCountedLoopNode? vtn->is_load_or_store_in_loop()) { is_not_before_loop.set(vtn->_idx); continue; @@ -239,7 +238,6 @@ void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { break; } } - // TODO: what about CFG nodes? } } From 5373397ede7410fd7995458a09ad535b973209e7 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 11:05:32 +0200 Subject: [PATCH 16/39] resolve more TODOS --- src/hotspot/share/opto/vectorization.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 3527e502afb7a..15e4248cf409a 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -546,7 +546,6 @@ void VLoopDependencyGraph::PredsIterator::next() { bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Outside body? if (!_vloop.in_bb(n)) { return true; } - // TODO: can we widen this to the loop, not just bb? // Internal nodes of pointer expressions are most likely folded into // the load / store and have no additional cost. @@ -578,7 +577,6 @@ float VLoopAnalyzer::cost() const { #endif float sum = 0; - // TODO: does this go over the whole loop, or just the basic block? 
for (int j = 0; j < body().body().length(); j++) { Node* n = body().body().at(j); if (!has_zero_cost(n)) { From f0d9fa285e3f4a28efc7f6f0f25d6d479cc19b7e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 16:11:51 +0200 Subject: [PATCH 17/39] wip reductions IR test --- .../loopopts/superword/TestReductions.java | 176 ++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java new file mode 100644 index 0000000000000..fcabe4963f029 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 8340093 + * @summary Test vectorization of reduction loops. 
+ * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions xxxx + */ + +package compiler.loopopts.superword; + +import jdk.test.lib.Utils; +import java.util.Map; +import java.util.HashMap; +import java.util.Random; + +import compiler.lib.ir_framework.*; +import compiler.lib.verify.*; +import static compiler.lib.generators.Generators.G; +import compiler.lib.generators.Generator; + +/** + * Note: there is a corresponding JMH benchmark: + * test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java + */ +public class TestReductions { + static int SIZE = 1024*8; + private static final Random RANDOM = Utils.getRandomInstance(); + public static final Generator GEN_I = G.ints(); + public static final Generator GEN_F = G.floats(); + + private static byte[] in1B = fillRandom(new byte[SIZE]); + private static byte[] in2B = fillRandom(new byte[SIZE]); + private static byte[] in3B = fillRandom(new byte[SIZE]); + //private static char[] in1C = fillRandom(new char[SIZE]); + //private static char[] in2C = fillRandom(new char[SIZE]); + //private static char[] in3C = fillRandom(new char[SIZE]); + //private static short[] in1S = fillRandom(new short[SIZE]); + //private static short[] in2S = fillRandom(new short[SIZE]); + //private static short[] in3S = fillRandom(new short[SIZE]); + + private static int[] in1I = fillRandom(new int[SIZE]); + private static int[] in2I = fillRandom(new int[SIZE]); + private static int[] in3I = fillRandom(new int[SIZE]); + //private static long[] in1L = fillRandom(new long[SIZE]); + //private static long[] in2L = fillRandom(new long[SIZE]); + //private static long[] in3L = fillRandom(new long[SIZE]); + + //private static float[] in1F = fillRandom(new float[SIZE]); + //private static float[] in2F = fillRandom(new float[SIZE]); + //private static float[] in3F = fillRandom(new float[SIZE]); + //private static double[] in1D = fillRandom(new doulbe[SIZE]); + //private static double[] in2D = fillRandom(new doulbe[SIZE]); + 
//private static double[] in3D = fillRandom(new doulbe[SIZE]); + + interface TestFunction { + Object run(); + } + + // Map of test names to tests. + Map tests = new HashMap(); + + // Map of gold, the results from the first run (before compilation), one per tests entry. + Map golds = new HashMap(); + + public static void main(String[] args) { + TestFramework framework = new TestFramework(TestReductions.class); + switch (args[0]) { + case "xxxx" -> { framework.addFlags("-XX:-AlignVector"); } + default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); } + }; + framework.start(); + } + + public TestReductions() { + // Add all tests to list + tests.put("test1", TestReductions::test1); + tests.put("test2", TestReductions::test2); + + // Compute gold value for all test methods before compilation + for (Map.Entry entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + Object gold = test.run(); + golds.put(name, gold); + } + } + + @Warmup(100) + @Run(test = {"test1", + "test2"}) + public void runTests() { + for (Map.Entry entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + // Recall gold value from before compilation + Object gold = golds.get(name); + // Compute new result + Object result = test.run(); + // Compare gold and new result + try { + Verify.checkEQ(gold, result); + } catch (VerifyException e) { + throw new RuntimeException("Verify failed for " + name, e); + } + } + } + + static byte[] fillRandom(byte[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (byte)(int)GEN_I.next(); + } + return a; + } + + static int[] fillRandom(int[] a) { + G.fill(GEN_I, a); + return a; + } + + @Test + // @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + // IRNode.STORE_VECTOR, "> 0", + // ".*multiversion.*", "= 0"}, + // phase = CompilePhase.PRINT_IDEAL, + // applyIfPlatform = {"64-bit", "true"}, + // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // 
// Should always vectorize, no speculative runtime check required. + static byte test1() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc += val; + } + return acc; + } + + @Test + // @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + // IRNode.STORE_VECTOR, "> 0", + // ".*multiversion.*", "= 0"}, + // phase = CompilePhase.PRINT_IDEAL, + // applyIfPlatform = {"64-bit", "true"}, + // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // // Should always vectorize, no speculative runtime check required. + static byte test2() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc *= val; + } + return acc; + } +} From 35eec33c11c8cd1d86a81551d42829f9ea21d42f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 16:12:09 +0200 Subject: [PATCH 18/39] linking comment --- .../micro/org/openjdk/bench/vm/compiler/VectorReduction2.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java index ec614cb324bc2..63fbf03008301 100644 --- a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java +++ b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java @@ -28,6 +28,10 @@ import java.util.concurrent.TimeUnit; import java.util.Random; +/** + * Note: there is a corresponding IR test: + * test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java + */ @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.NANOSECONDS) @State(Scope.Thread) From 8e4a2ce0f658b91c66401b5a17fa4fa70c18a41b Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 16:17:45 +0200 Subject: [PATCH 19/39] wip test --- .../loopopts/superword/TestReductions.java | 64 +++++++++++++------ 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java 
b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index fcabe4963f029..4072cc1fc3505 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -31,10 +31,8 @@ package compiler.loopopts.superword; -import jdk.test.lib.Utils; import java.util.Map; import java.util.HashMap; -import java.util.Random; import compiler.lib.ir_framework.*; import compiler.lib.verify.*; @@ -47,33 +45,34 @@ */ public class TestReductions { static int SIZE = 1024*8; - private static final Random RANDOM = Utils.getRandomInstance(); public static final Generator GEN_I = G.ints(); + public static final Generator GEN_L = G.longs(); public static final Generator GEN_F = G.floats(); + public static final Generator GEN_D = G.doubles(); private static byte[] in1B = fillRandom(new byte[SIZE]); private static byte[] in2B = fillRandom(new byte[SIZE]); private static byte[] in3B = fillRandom(new byte[SIZE]); - //private static char[] in1C = fillRandom(new char[SIZE]); - //private static char[] in2C = fillRandom(new char[SIZE]); - //private static char[] in3C = fillRandom(new char[SIZE]); - //private static short[] in1S = fillRandom(new short[SIZE]); - //private static short[] in2S = fillRandom(new short[SIZE]); - //private static short[] in3S = fillRandom(new short[SIZE]); + private static char[] in1C = fillRandom(new char[SIZE]); + private static char[] in2C = fillRandom(new char[SIZE]); + private static char[] in3C = fillRandom(new char[SIZE]); + private static short[] in1S = fillRandom(new short[SIZE]); + private static short[] in2S = fillRandom(new short[SIZE]); + private static short[] in3S = fillRandom(new short[SIZE]); private static int[] in1I = fillRandom(new int[SIZE]); private static int[] in2I = fillRandom(new int[SIZE]); private static int[] in3I = fillRandom(new int[SIZE]); - //private static long[] in1L = fillRandom(new long[SIZE]); - //private static long[] in2L = 
fillRandom(new long[SIZE]); - //private static long[] in3L = fillRandom(new long[SIZE]); + private static long[] in1L = fillRandom(new long[SIZE]); + private static long[] in2L = fillRandom(new long[SIZE]); + private static long[] in3L = fillRandom(new long[SIZE]); - //private static float[] in1F = fillRandom(new float[SIZE]); - //private static float[] in2F = fillRandom(new float[SIZE]); - //private static float[] in3F = fillRandom(new float[SIZE]); - //private static double[] in1D = fillRandom(new doulbe[SIZE]); - //private static double[] in2D = fillRandom(new doulbe[SIZE]); - //private static double[] in3D = fillRandom(new doulbe[SIZE]); + private static float[] in1F = fillRandom(new float[SIZE]); + private static float[] in2F = fillRandom(new float[SIZE]); + private static float[] in3F = fillRandom(new float[SIZE]); + private static double[] in1D = fillRandom(new double[SIZE]); + private static double[] in2D = fillRandom(new double[SIZE]); + private static double[] in3D = fillRandom(new double[SIZE]); interface TestFunction { Object run(); @@ -135,11 +134,40 @@ static byte[] fillRandom(byte[] a) { return a; } + static char[] fillRandom(char[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (char)(int)GEN_I.next(); + } + return a; + } + + static short[] fillRandom(short[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (short)(int)GEN_I.next(); + } + return a; + } + static int[] fillRandom(int[] a) { G.fill(GEN_I, a); return a; } + static long[] fillRandom(long[] a) { + G.fill(GEN_L, a); + return a; + } + + static float[] fillRandom(float[] a) { + G.fill(GEN_F, a); + return a; + } + + static double[] fillRandom(double[] a) { + G.fill(GEN_D, a); + return a; + } + @Test // @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", // IRNode.STORE_VECTOR, "> 0", From ed16cf6cc9f8a5b72296c67dd1481bb6f31d449f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 16:38:23 +0200 Subject: [PATCH 20/39] added tests --- .../loopopts/superword/TestReductions.java 
| 1597 ++++++++++++++++- 1 file changed, 1571 insertions(+), 26 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 4072cc1fc3505..6266c08386f28 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -44,11 +44,11 @@ * test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java */ public class TestReductions { - static int SIZE = 1024*8; - public static final Generator GEN_I = G.ints(); - public static final Generator GEN_L = G.longs(); - public static final Generator GEN_F = G.floats(); - public static final Generator GEN_D = G.doubles(); + private static int SIZE = 1024*8; + private static final Generator GEN_I = G.ints(); + private static final Generator GEN_L = G.longs(); + private static final Generator GEN_F = G.floats(); + private static final Generator GEN_D = G.doubles(); private static byte[] in1B = fillRandom(new byte[SIZE]); private static byte[] in2B = fillRandom(new byte[SIZE]); @@ -95,8 +95,141 @@ public static void main(String[] args) { public TestReductions() { // Add all tests to list - tests.put("test1", TestReductions::test1); - tests.put("test2", TestReductions::test2); + tests.put("byteAndSimple", TestReductions::byteAndSimple); + tests.put("byteOrSimple", TestReductions::byteOrSimple); + tests.put("byteXorSimple", TestReductions::byteXorSimple); + tests.put("byteAddSimple", TestReductions::byteAddSimple); + tests.put("byteMulSimple", TestReductions::byteMulSimple); + tests.put("byteMinSimple", TestReductions::byteMinSimple); + tests.put("byteMaxSimple", TestReductions::byteMaxSimple); + tests.put("byteAndDotProduct", TestReductions::byteAndDotProduct); + tests.put("byteOrDotProduct", TestReductions::byteOrDotProduct); + tests.put("byteXorDotProduct", TestReductions::byteXorDotProduct); + tests.put("byteAddDotProduct", 
TestReductions::byteAddDotProduct); + tests.put("byteMulDotProduct", TestReductions::byteMulDotProduct); + tests.put("byteMinDotProduct", TestReductions::byteMinDotProduct); + tests.put("byteMaxDotProduct", TestReductions::byteMaxDotProduct); + tests.put("byteAndBig", TestReductions::byteAndBig); + tests.put("byteOrBig", TestReductions::byteOrBig); + tests.put("byteXorBig", TestReductions::byteXorBig); + tests.put("byteAddBig", TestReductions::byteAddBig); + tests.put("byteMulBig", TestReductions::byteMulBig); + tests.put("byteMinBig", TestReductions::byteMinBig); + tests.put("byteMaxBig", TestReductions::byteMaxBig); + + tests.put("charAndSimple", TestReductions::charAndSimple); + tests.put("charOrSimple", TestReductions::charOrSimple); + tests.put("charXorSimple", TestReductions::charXorSimple); + tests.put("charAddSimple", TestReductions::charAddSimple); + tests.put("charMulSimple", TestReductions::charMulSimple); + tests.put("charMinSimple", TestReductions::charMinSimple); + tests.put("charMaxSimple", TestReductions::charMaxSimple); + tests.put("charAndDotProduct", TestReductions::charAndDotProduct); + tests.put("charOrDotProduct", TestReductions::charOrDotProduct); + tests.put("charXorDotProduct", TestReductions::charXorDotProduct); + tests.put("charAddDotProduct", TestReductions::charAddDotProduct); + tests.put("charMulDotProduct", TestReductions::charMulDotProduct); + tests.put("charMinDotProduct", TestReductions::charMinDotProduct); + tests.put("charMaxDotProduct", TestReductions::charMaxDotProduct); + tests.put("charAndBig", TestReductions::charAndBig); + tests.put("charOrBig", TestReductions::charOrBig); + tests.put("charXorBig", TestReductions::charXorBig); + tests.put("charAddBig", TestReductions::charAddBig); + tests.put("charMulBig", TestReductions::charMulBig); + tests.put("charMinBig", TestReductions::charMinBig); + tests.put("charMaxBig", TestReductions::charMaxBig); + + tests.put("shortAndSimple", TestReductions::shortAndSimple); + 
tests.put("shortOrSimple", TestReductions::shortOrSimple); + tests.put("shortXorSimple", TestReductions::shortXorSimple); + tests.put("shortAddSimple", TestReductions::shortAddSimple); + tests.put("shortMulSimple", TestReductions::shortMulSimple); + tests.put("shortMinSimple", TestReductions::shortMinSimple); + tests.put("shortMaxSimple", TestReductions::shortMaxSimple); + tests.put("shortAndDotProduct", TestReductions::shortAndDotProduct); + tests.put("shortOrDotProduct", TestReductions::shortOrDotProduct); + tests.put("shortXorDotProduct", TestReductions::shortXorDotProduct); + tests.put("shortAddDotProduct", TestReductions::shortAddDotProduct); + tests.put("shortMulDotProduct", TestReductions::shortMulDotProduct); + tests.put("shortMinDotProduct", TestReductions::shortMinDotProduct); + tests.put("shortMaxDotProduct", TestReductions::shortMaxDotProduct); + tests.put("shortAndBig", TestReductions::shortAndBig); + tests.put("shortOrBig", TestReductions::shortOrBig); + tests.put("shortXorBig", TestReductions::shortXorBig); + tests.put("shortAddBig", TestReductions::shortAddBig); + tests.put("shortMulBig", TestReductions::shortMulBig); + tests.put("shortMinBig", TestReductions::shortMinBig); + tests.put("shortMaxBig", TestReductions::shortMaxBig); + + tests.put("intAndSimple", TestReductions::intAndSimple); + tests.put("intOrSimple", TestReductions::intOrSimple); + tests.put("intXorSimple", TestReductions::intXorSimple); + tests.put("intAddSimple", TestReductions::intAddSimple); + tests.put("intMulSimple", TestReductions::intMulSimple); + tests.put("intMinSimple", TestReductions::intMinSimple); + tests.put("intMaxSimple", TestReductions::intMaxSimple); + tests.put("intAndDotProduct", TestReductions::intAndDotProduct); + tests.put("intOrDotProduct", TestReductions::intOrDotProduct); + tests.put("intXorDotProduct", TestReductions::intXorDotProduct); + tests.put("intAddDotProduct", TestReductions::intAddDotProduct); + tests.put("intMulDotProduct", 
TestReductions::intMulDotProduct); + tests.put("intMinDotProduct", TestReductions::intMinDotProduct); + tests.put("intMaxDotProduct", TestReductions::intMaxDotProduct); + tests.put("intAndBig", TestReductions::intAndBig); + tests.put("intOrBig", TestReductions::intOrBig); + tests.put("intXorBig", TestReductions::intXorBig); + tests.put("intAddBig", TestReductions::intAddBig); + tests.put("intMulBig", TestReductions::intMulBig); + tests.put("intMinBig", TestReductions::intMinBig); + tests.put("intMaxBig", TestReductions::intMaxBig); + + tests.put("longAndSimple", TestReductions::longAndSimple); + tests.put("longOrSimple", TestReductions::longOrSimple); + tests.put("longXorSimple", TestReductions::longXorSimple); + tests.put("longAddSimple", TestReductions::longAddSimple); + tests.put("longMulSimple", TestReductions::longMulSimple); + tests.put("longMinSimple", TestReductions::longMinSimple); + tests.put("longMaxSimple", TestReductions::longMaxSimple); + tests.put("longAndDotProduct", TestReductions::longAndDotProduct); + tests.put("longOrDotProduct", TestReductions::longOrDotProduct); + tests.put("longXorDotProduct", TestReductions::longXorDotProduct); + tests.put("longAddDotProduct", TestReductions::longAddDotProduct); + tests.put("longMulDotProduct", TestReductions::longMulDotProduct); + tests.put("longMinDotProduct", TestReductions::longMinDotProduct); + tests.put("longMaxDotProduct", TestReductions::longMaxDotProduct); + tests.put("longAndBig", TestReductions::longAndBig); + tests.put("longOrBig", TestReductions::longOrBig); + tests.put("longXorBig", TestReductions::longXorBig); + tests.put("longAddBig", TestReductions::longAddBig); + tests.put("longMulBig", TestReductions::longMulBig); + tests.put("longMinBig", TestReductions::longMinBig); + tests.put("longMaxBig", TestReductions::longMaxBig); + + tests.put("floatAddSimple", TestReductions::floatAddSimple); + tests.put("floatMulSimple", TestReductions::floatMulSimple); + tests.put("floatMinSimple", 
TestReductions::floatMinSimple); + tests.put("floatMaxSimple", TestReductions::floatMaxSimple); + tests.put("floatAddDotProduct", TestReductions::floatAddDotProduct); + tests.put("floatMulDotProduct", TestReductions::floatMulDotProduct); + tests.put("floatMinDotProduct", TestReductions::floatMinDotProduct); + tests.put("floatMaxDotProduct", TestReductions::floatMaxDotProduct); + tests.put("floatAddBig", TestReductions::floatAddBig); + tests.put("floatMulBig", TestReductions::floatMulBig); + tests.put("floatMinBig", TestReductions::floatMinBig); + tests.put("floatMaxBig", TestReductions::floatMaxBig); + + tests.put("doubleAddSimple", TestReductions::doubleAddSimple); + tests.put("doubleMulSimple", TestReductions::doubleMulSimple); + tests.put("doubleMinSimple", TestReductions::doubleMinSimple); + tests.put("doubleMaxSimple", TestReductions::doubleMaxSimple); + tests.put("doubleAddDotProduct", TestReductions::doubleAddDotProduct); + tests.put("doubleMulDotProduct", TestReductions::doubleMulDotProduct); + tests.put("doubleMinDotProduct", TestReductions::doubleMinDotProduct); + tests.put("doubleMaxDotProduct", TestReductions::doubleMaxDotProduct); + tests.put("doubleAddBig", TestReductions::doubleAddBig); + tests.put("doubleMulBig", TestReductions::doubleMulBig); + tests.put("doubleMinBig", TestReductions::doubleMinBig); + tests.put("doubleMaxBig", TestReductions::doubleMaxBig); // Compute gold value for all test methods before compilation for (Map.Entry entry : tests.entrySet()) { @@ -108,8 +241,141 @@ public TestReductions() { } @Warmup(100) - @Run(test = {"test1", - "test2"}) + @Run(test = {"byteAndSimple", + "byteOrSimple", + "byteXorSimple", + "byteAddSimple", + "byteMulSimple", + "byteMinSimple", + "byteMaxSimple", + "byteAndDotProduct", + "byteOrDotProduct", + "byteXorDotProduct", + "byteAddDotProduct", + "byteMulDotProduct", + "byteMinDotProduct", + "byteMaxDotProduct", + "byteAndBig", + "byteOrBig", + "byteXorBig", + "byteAddBig", + "byteMulBig", + 
"byteMinBig", + "byteMaxBig", + + "charAndSimple", + "charOrSimple", + "charXorSimple", + "charAddSimple", + "charMulSimple", + "charMinSimple", + "charMaxSimple", + "charAndDotProduct", + "charOrDotProduct", + "charXorDotProduct", + "charAddDotProduct", + "charMulDotProduct", + "charMinDotProduct", + "charMaxDotProduct", + "charAndBig", + "charOrBig", + "charXorBig", + "charAddBig", + "charMulBig", + "charMinBig", + "charMaxBig", + + "shortAndSimple", + "shortOrSimple", + "shortXorSimple", + "shortAddSimple", + "shortMulSimple", + "shortMinSimple", + "shortMaxSimple", + "shortAndDotProduct", + "shortOrDotProduct", + "shortXorDotProduct", + "shortAddDotProduct", + "shortMulDotProduct", + "shortMinDotProduct", + "shortMaxDotProduct", + "shortAndBig", + "shortOrBig", + "shortXorBig", + "shortAddBig", + "shortMulBig", + "shortMinBig", + "shortMaxBig", + + "intAndSimple", + "intOrSimple", + "intXorSimple", + "intAddSimple", + "intMulSimple", + "intMinSimple", + "intMaxSimple", + "intAndDotProduct", + "intOrDotProduct", + "intXorDotProduct", + "intAddDotProduct", + "intMulDotProduct", + "intMinDotProduct", + "intMaxDotProduct", + "intAndBig", + "intOrBig", + "intXorBig", + "intAddBig", + "intMulBig", + "intMinBig", + "intMaxBig", + + "longAndSimple", + "longOrSimple", + "longXorSimple", + "longAddSimple", + "longMulSimple", + "longMinSimple", + "longMaxSimple", + "longAndDotProduct", + "longOrDotProduct", + "longXorDotProduct", + "longAddDotProduct", + "longMulDotProduct", + "longMinDotProduct", + "longMaxDotProduct", + "longAndBig", + "longOrBig", + "longXorBig", + "longAddBig", + "longMulBig", + "longMinBig", + "longMaxBig", + + "floatAddSimple", + "floatMulSimple", + "floatMinSimple", + "floatMaxSimple", + "floatAddDotProduct", + "floatMulDotProduct", + "floatMinDotProduct", + "floatMaxDotProduct", + "floatAddBig", + "floatMulBig", + "floatMinBig", + "floatMaxBig", + + "doubleAddSimple", + "doubleMulSimple", + "doubleMinSimple", + "doubleMaxSimple", + 
"doubleAddDotProduct", + "doubleMulDotProduct", + "doubleMinDotProduct", + "doubleMaxDotProduct", + "doubleAddBig", + "doubleMulBig", + "doubleMinBig", + "doubleMaxBig"}) public void runTests() { for (Map.Entry entry : tests.entrySet()) { String name = entry.getKey(); @@ -168,37 +434,1316 @@ static double[] fillRandom(double[] a) { return a; } + // ---------byte***Simple ------------------------------------------------------------ + @Test + private static byte byteAndSimple() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc &= val; + } + return acc; + } + @Test - // @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", - // IRNode.STORE_VECTOR, "> 0", - // ".*multiversion.*", "= 0"}, - // phase = CompilePhase.PRINT_IDEAL, - // applyIfPlatform = {"64-bit", "true"}, - // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) - // // Should always vectorize, no speculative runtime check required. - static byte test1() { + private static byte byteOrSimple() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { byte val = in1B[i]; - acc += val; + acc |= val; + } + return acc; + } + + @Test + private static byte byteXorSimple() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc ^= val; } return acc; } @Test - // @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", - // IRNode.STORE_VECTOR, "> 0", - // ".*multiversion.*", "= 0"}, - // phase = CompilePhase.PRINT_IDEAL, - // applyIfPlatform = {"64-bit", "true"}, - // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) - // // Should always vectorize, no speculative runtime check required. 
- static byte test2() { + private static byte byteAddSimple() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { byte val = in1B[i]; + acc += val; + } + return acc; + } + + @Test + private static byte byteMulSimple() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc *= val; + } + return acc; + } + + @Test + private static byte byteMinSimple() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + private static byte byteMaxSimple() { + byte acc = Byte.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------byte***DotProduct ------------------------------------------------------------ + @Test + private static byte byteAndDotProduct() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc &= val; + } + return acc; + } + + @Test + private static byte byteOrDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc |= val; + } + return acc; + } + + @Test + private static byte byteXorDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc ^= val; + } + return acc; + } + + @Test + private static byte byteAddDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc += val; + } + return acc; + } + + @Test + private static byte byteMulDotProduct() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc *= val; + } + return acc; + } + + @Test + private static byte byteMinDotProduct() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int 
i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + private static byte byteMaxDotProduct() { + byte acc = Byte.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------byte***Big ------------------------------------------------------------ + @Test + private static byte byteAndBig() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc &= val; + } + return acc; + } + + @Test + private static byte byteOrBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc |= val; + } + return acc; + } + + @Test + private static byte byteXorBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc ^= val; + } + return acc; + } + + @Test + private static byte byteAddBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc += val; + } + return acc; + } + + @Test + private static byte byteMulBig() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc *= val; + } + return acc; + } + + @Test + private static byte byteMinBig() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + private static byte byteMaxBig() { + byte acc = Byte.MIN_VALUE; // 
neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------char***Simple ------------------------------------------------------------ + @Test + private static char charAndSimple() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc &= val; + } + return acc; + } + + @Test + private static char charOrSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc |= val; + } + return acc; + } + + @Test + private static char charXorSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc ^= val; + } + return acc; + } + + @Test + private static char charAddSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc += val; + } + return acc; + } + + @Test + private static char charMulSimple() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc *= val; + } + return acc; + } + + @Test + private static char charMinSimple() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + private static char charMaxSimple() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------char***DotProduct ------------------------------------------------------------ + @Test + private static char charAndDotProduct() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc &= val; + } + return acc; + } + + @Test + private static char charOrDotProduct() { + char acc = 0; // neutral 
element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc |= val; + } + return acc; + } + + @Test + private static char charXorDotProduct() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc ^= val; + } + return acc; + } + + @Test + private static char charAddDotProduct() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc += val; + } + return acc; + } + + @Test + private static char charMulDotProduct() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); acc *= val; } return acc; } + + @Test + private static char charMinDotProduct() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + private static char charMaxDotProduct() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------char***Big ------------------------------------------------------------ + @Test + private static char charAndBig() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc &= val; + } + return acc; + } + + @Test + private static char charOrBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc |= val; + } + return acc; + } + + @Test + private static char charXorBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc ^= val; + } + return acc; + } 
+ + @Test + private static char charAddBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc += val; + } + return acc; + } + + @Test + private static char charMulBig() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc *= val; + } + return acc; + } + + @Test + private static char charMinBig() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + private static char charMaxBig() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------short***Simple ------------------------------------------------------------ + @Test + private static short shortAndSimple() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc &= val; + } + return acc; + } + + @Test + private static short shortOrSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc |= val; + } + return acc; + } + + @Test + private static short shortXorSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc ^= val; + } + return acc; + } + + @Test + private static short shortAddSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc += val; + } + return acc; + } + + @Test + private static short shortMulSimple() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = 
in1S[i]; + acc *= val; + } + return acc; + } + + @Test + private static short shortMinSimple() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + private static short shortMaxSimple() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------short***DotProduct ------------------------------------------------------------ + @Test + private static short shortAndDotProduct() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc &= val; + } + return acc; + } + + @Test + private static short shortOrDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc |= val; + } + return acc; + } + + @Test + private static short shortXorDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc ^= val; + } + return acc; + } + + @Test + private static short shortAddDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc += val; + } + return acc; + } + + @Test + private static short shortMulDotProduct() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc *= val; + } + return acc; + } + + @Test + private static short shortMinDotProduct() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + private static short shortMaxDotProduct() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short 
val = (short)(in1S[i] * in2S[i]); + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------short***Big ------------------------------------------------------------ + @Test + private static short shortAndBig() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc &= val; + } + return acc; + } + + @Test + private static short shortOrBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc |= val; + } + return acc; + } + + @Test + private static short shortXorBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc ^= val; + } + return acc; + } + + @Test + private static short shortAddBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc += val; + } + return acc; + } + + @Test + private static short shortMulBig() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc *= val; + } + return acc; + } + + @Test + private static short shortMinBig() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + private static short shortMaxBig() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------int***Simple 
------------------------------------------------------------ + @Test + private static int intAndSimple() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc &= val; + } + return acc; + } + + @Test + private static int intOrSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc |= val; + } + return acc; + } + + @Test + private static int intXorSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc ^= val; + } + return acc; + } + + @Test + private static int intAddSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc += val; + } + return acc; + } + + @Test + private static int intMulSimple() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc *= val; + } + return acc; + } + + @Test + private static int intMinSimple() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static int intMaxSimple() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------int***DotProduct ------------------------------------------------------------ + @Test + private static int intAndDotProduct() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc &= val; + } + return acc; + } + + @Test + private static int intOrDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc |= val; + } + return acc; + } + + @Test + private static int intXorDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc ^= val; + } + return 
acc; + } + + @Test + private static int intAddDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc += val; + } + return acc; + } + + @Test + private static int intMulDotProduct() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc *= val; + } + return acc; + } + + @Test + private static int intMinDotProduct() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static int intMaxDotProduct() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------int***Big ------------------------------------------------------------ + @Test + private static int intAndBig() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc &= val; + } + return acc; + } + + @Test + private static int intOrBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc |= val; + } + return acc; + } + + @Test + private static int intXorBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc ^= val; + } + return acc; + } + + @Test + private static int intAddBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc += val; + } + return acc; + } + + @Test + private static int intMulBig() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + 
(in2I[i] * in3I[i]); + acc *= val; + } + return acc; + } + + @Test + private static int intMinBig() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static int intMaxBig() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc = Math.max(acc, val); + } + return acc; + } + + // ---------long***Simple ------------------------------------------------------------ + @Test + private static long longAndSimple() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc &= val; + } + return acc; + } + + @Test + private static long longOrSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc |= val; + } + return acc; + } + + @Test + private static long longXorSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc ^= val; + } + return acc; + } + + @Test + private static long longAddSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc += val; + } + return acc; + } + + @Test + private static long longMulSimple() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc *= val; + } + return acc; + } + + @Test + private static long longMinSimple() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static long longMaxSimple() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // 
---------long***DotProduct ------------------------------------------------------------ + @Test + private static long longAndDotProduct() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc &= val; + } + return acc; + } + + @Test + private static long longOrDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc |= val; + } + return acc; + } + + @Test + private static long longXorDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc ^= val; + } + return acc; + } + + @Test + private static long longAddDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc += val; + } + return acc; + } + + @Test + private static long longMulDotProduct() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc *= val; + } + return acc; + } + + @Test + private static long longMinDotProduct() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static long longMaxDotProduct() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------long***Big ------------------------------------------------------------ + @Test + private static long longAndBig() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc &= val; + } + return acc; + } + + @Test + private static long longOrBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * 
in3L[i]) + (in2L[i] * in3L[i]); + acc |= val; + } + return acc; + } + + @Test + private static long longXorBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc ^= val; + } + return acc; + } + + @Test + private static long longAddBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc += val; + } + return acc; + } + + @Test + private static long longMulBig() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc *= val; + } + return acc; + } + + @Test + private static long longMinBig() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static long longMaxBig() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***Simple ------------------------------------------------------------ + @Test + private static float floatAddSimple() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc += val; + } + return acc; + } + + @Test + private static float floatMulSimple() { + float acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc *= val; + } + return acc; + } + + @Test + private static float floatMinSimple() { + float acc = Float.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static float floatMaxSimple() { + 
float acc = Float.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***DotProduct ------------------------------------------------------------ + @Test + private static float floatAddDotProduct() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc += val; + } + return acc; + } + + @Test + private static float floatMulDotProduct() { + float acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc *= val; + } + return acc; + } + + @Test + private static float floatMinDotProduct() { + float acc = Float.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static float floatMaxDotProduct() { + float acc = Float.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***Big ------------------------------------------------------------ + @Test + private static float floatAddBig() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]); + acc += val; + } + return acc; + } + + @Test + private static float floatMulBig() { + float acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]); + acc *= val; + } + return acc; + } + + @Test + private static float floatMinBig() { + float acc = Float.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static float floatMaxBig() { + float acc = Float.MIN_VALUE; // 
neutral element + for (int i = 0; i < SIZE; i++) { + float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]); + acc = Math.max(acc, val); + } + return acc; + } + + // ---------double***Simple ------------------------------------------------------------ + @Test + private static double doubleAddSimple() { + double acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i]; + acc += val; + } + return acc; + } + + @Test + private static double doubleMulSimple() { + double acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i]; + acc *= val; + } + return acc; + } + + @Test + private static double doubleMinSimple() { + double acc = Double.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static double doubleMaxSimple() { + double acc = Double.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------double***DotProduct ------------------------------------------------------------ + @Test + private static double doubleAddDotProduct() { + double acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i] * in2D[i]; + acc += val; + } + return acc; + } + + @Test + private static double doubleMulDotProduct() { + double acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i] * in2D[i]; + acc *= val; + } + return acc; + } + + @Test + private static double doubleMinDotProduct() { + double acc = Double.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i] * in2D[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static double doubleMaxDotProduct() { + double acc = Double.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i] * in2D[i]; + acc = Math.max(acc, 
val); + } + return acc; + } + + // ---------double***Big ------------------------------------------------------------ + @Test + private static double doubleAddBig() { + double acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]); + acc += val; + } + return acc; + } + + @Test + private static double doubleMulBig() { + double acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]); + acc *= val; + } + return acc; + } + + @Test + private static double doubleMinBig() { + double acc = Double.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static double doubleMaxBig() { + double acc = Double.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]); + acc = Math.max(acc, val); + } + return acc; + } + + } From 802054a051af71c98c08ba0ff75e79cd7115cb16 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 17:00:06 +0200 Subject: [PATCH 21/39] wip IR rules --- .../loopopts/superword/TestReductions.java | 92 ++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 6266c08386f28..bd8bd2359ed2a 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -26,7 +26,23 @@ * @bug 8340093 * @summary Test vectorization of reduction loops. 
* @library /test/lib / - * @run driver compiler.loopopts.superword.TestReductions xxxx + * @run driver compiler.loopopts.superword.TestReductions P0 + */ + +/* + * @test + * @bug 8340093 + * @summary Test vectorization of reduction loops. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions P1 + */ + +/* + * @test + * @bug 8340093 + * @summary Test vectorization of reduction loops. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions P2 */ package compiler.loopopts.superword; @@ -87,7 +103,9 @@ interface TestFunction { public static void main(String[] args) { TestFramework framework = new TestFramework(TestReductions.class); switch (args[0]) { - case "xxxx" -> { framework.addFlags("-XX:-AlignVector"); } + case "P0" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=0"); } + case "P1" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=1"); } + case "P2" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2"); } default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); } }; framework.start(); @@ -436,6 +454,7 @@ static double[] fillRandom(double[] a) { // ---------byte***Simple ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAndSimple() { byte acc = (byte)0xFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -446,6 +465,7 @@ private static byte byteAndSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. 
private static byte byteOrSimple() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -456,6 +476,7 @@ private static byte byteOrSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteXorSimple() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -466,6 +487,7 @@ private static byte byteXorSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAddSimple() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -476,6 +498,7 @@ private static byte byteAddSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMulSimple() { byte acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -486,6 +509,7 @@ private static byte byteMulSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMinSimple() { byte acc = Byte.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -496,6 +520,7 @@ private static byte byteMinSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMaxSimple() { byte acc = Byte.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -507,6 +532,7 @@ private static byte byteMaxSimple() { // ---------byte***DotProduct ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAndDotProduct() { byte acc = (byte)0xFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -517,6 +543,7 @@ private static byte byteAndDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. 
private static byte byteOrDotProduct() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -527,6 +554,7 @@ private static byte byteOrDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteXorDotProduct() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -537,6 +565,7 @@ private static byte byteXorDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAddDotProduct() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -547,6 +576,7 @@ private static byte byteAddDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMulDotProduct() { byte acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -557,6 +587,7 @@ private static byte byteMulDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMinDotProduct() { byte acc = Byte.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -567,6 +598,7 @@ private static byte byteMinDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMaxDotProduct() { byte acc = Byte.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -578,6 +610,7 @@ private static byte byteMaxDotProduct() { // ---------byte***Big ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAndBig() { byte acc = (byte)0xFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -588,6 +621,7 @@ private static byte byteAndBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. 
private static byte byteOrBig() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -598,6 +632,7 @@ private static byte byteOrBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteXorBig() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -608,6 +643,7 @@ private static byte byteXorBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAddBig() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -618,6 +654,7 @@ private static byte byteAddBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMulBig() { byte acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -628,6 +665,7 @@ private static byte byteMulBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMinBig() { byte acc = Byte.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -638,6 +676,7 @@ private static byte byteMinBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMaxBig() { byte acc = Byte.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -649,6 +688,7 @@ private static byte byteMaxBig() { // ---------char***Simple ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAndSimple() { char acc = (char)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -659,6 +699,7 @@ private static char charAndSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. 
private static char charOrSimple() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -669,6 +710,7 @@ private static char charOrSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charXorSimple() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -679,6 +721,7 @@ private static char charXorSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAddSimple() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -689,6 +732,7 @@ private static char charAddSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMulSimple() { char acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -699,6 +743,7 @@ private static char charMulSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMinSimple() { char acc = Character.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -709,6 +754,7 @@ private static char charMinSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMaxSimple() { char acc = Character.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -720,6 +766,7 @@ private static char charMaxSimple() { // ---------char***DotProduct ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAndDotProduct() { char acc = (char)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -730,6 +777,7 @@ private static char charAndDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. 
private static char charOrDotProduct() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -740,6 +788,7 @@ private static char charOrDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charXorDotProduct() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -750,6 +799,7 @@ private static char charXorDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAddDotProduct() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -760,6 +810,7 @@ private static char charAddDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMulDotProduct() { char acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -770,6 +821,7 @@ private static char charMulDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMinDotProduct() { char acc = Character.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -780,6 +832,7 @@ private static char charMinDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMaxDotProduct() { char acc = Character.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -791,6 +844,7 @@ private static char charMaxDotProduct() { // ---------char***Big ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAndBig() { char acc = (char)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -801,6 +855,7 @@ private static char charAndBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. 
private static char charOrBig() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -811,6 +866,7 @@ private static char charOrBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charXorBig() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -821,6 +877,7 @@ private static char charXorBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAddBig() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -831,6 +888,7 @@ private static char charAddBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMulBig() { char acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -841,6 +899,7 @@ private static char charMulBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMinBig() { char acc = Character.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -851,6 +910,7 @@ private static char charMinBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMaxBig() { char acc = Character.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -862,6 +922,7 @@ private static char charMaxBig() { // ---------short***Simple ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortAndSimple() { short acc = (short)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -872,6 +933,7 @@ private static short shortAndSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
private static short shortOrSimple() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -882,6 +944,7 @@ private static short shortOrSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortXorSimple() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -892,6 +955,7 @@ private static short shortXorSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortAddSimple() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -902,6 +966,7 @@ private static short shortAddSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMulSimple() { short acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -912,6 +977,7 @@ private static short shortMulSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMinSimple() { short acc = Short.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -922,6 +988,7 @@ private static short shortMinSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMaxSimple() { short acc = Short.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -933,6 +1000,7 @@ private static short shortMaxSimple() { // ---------short***DotProduct ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortAndDotProduct() { short acc = (short)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -943,6 +1011,7 @@ private static short shortAndDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
private static short shortOrDotProduct() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -953,6 +1022,7 @@ private static short shortOrDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortXorDotProduct() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -963,6 +1033,7 @@ private static short shortXorDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortAddDotProduct() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -973,6 +1044,7 @@ private static short shortAddDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMulDotProduct() { short acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -983,6 +1055,7 @@ private static short shortMulDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMinDotProduct() { short acc = Short.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -993,6 +1066,7 @@ private static short shortMinDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMaxDotProduct() { short acc = Short.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1004,6 +1078,7 @@ private static short shortMaxDotProduct() { // ---------short***Big ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
private static short shortAndBig() { short acc = (short)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1014,6 +1089,7 @@ private static short shortAndBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortOrBig() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1024,6 +1100,7 @@ private static short shortOrBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortXorBig() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1034,6 +1111,7 @@ private static short shortXorBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortAddBig() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1044,6 +1122,7 @@ private static short shortAddBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMulBig() { short acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1054,6 +1133,7 @@ private static short shortMulBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMinBig() { short acc = Short.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1064,6 +1144,7 @@ private static short shortMinBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
private static short shortMaxBig() { short acc = Short.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1075,6 +1156,13 @@ private static short shortMaxBig() { // ---------int***Simple ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAndSimple() { int acc = 0xFFFFFFFF; // neutral element for (int i = 0; i < SIZE; i++) { From 90691a85d7f9ff35e77edb0810db6fe994ba779f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 17:17:13 +0200 Subject: [PATCH 22/39] int ir rules --- .../loopopts/superword/TestReductions.java | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index bd8bd2359ed2a..eb9b8d82b03df 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1173,6 +1173,13 @@ private static int intAndSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intOrSimple() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1183,6 +1190,13 @@ private static int intOrSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = 
{"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intXorSimple() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1193,6 +1207,13 @@ private static int intXorSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAddSimple() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1203,6 +1224,13 @@ private static int intAddSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMulSimple() { int acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1213,6 +1241,13 @@ private static int intMulSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMinSimple() { int acc = Integer.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1223,6 +1258,13 @@ private static int intMinSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = 
{"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMaxSimple() { int acc = Integer.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1234,6 +1276,13 @@ private static int intMaxSimple() { // ---------int***DotProduct ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAndDotProduct() { int acc = 0xFFFFFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1244,6 +1293,13 @@ private static int intAndDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intOrDotProduct() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1254,6 +1310,13 @@ private static int intOrDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intXorDotProduct() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1264,6 +1327,13 @@ private static int intXorDotProduct() { } @Test + @IR(counts 
= {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAddDotProduct() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1274,6 +1344,13 @@ private static int intAddDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMulDotProduct() { int acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1284,6 +1361,13 @@ private static int intMulDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMinDotProduct() { int acc = Integer.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1294,6 +1378,13 @@ private static int intMinDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMaxDotProduct() { int acc = Integer.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1305,6 +1396,13 @@ private static 
int intMaxDotProduct() { // ---------int***Big ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAndBig() { int acc = 0xFFFFFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1315,6 +1413,13 @@ private static int intAndBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intOrBig() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1325,6 +1430,13 @@ private static int intOrBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intXorBig() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1335,6 +1447,13 @@ private static int intXorBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAddBig() { int acc = 0; // neutral element for (int i = 0; i < 
SIZE; i++) { @@ -1345,6 +1464,13 @@ private static int intAddBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMulBig() { int acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1355,6 +1481,13 @@ private static int intMulBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMinBig() { int acc = Integer.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1365,6 +1498,13 @@ private static int intMinBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMaxBig() { int acc = Integer.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { From 8ecbf717b5cf8ec8da97abf2335239a3bc656990 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 17:24:00 +0200 Subject: [PATCH 23/39] first long ir --- .../jtreg/compiler/loopopts/superword/TestReductions.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index eb9b8d82b03df..49880b0199b9c 100644 --- 
a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1516,6 +1516,13 @@ private static int intMaxBig() { // ---------long***Simple ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndSimple() { long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element for (int i = 0; i < SIZE; i++) { From d3dad214a76cad179822f05203da7526aa67581c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 17:58:59 +0200 Subject: [PATCH 24/39] long ir rules --- .../loopopts/superword/TestReductions.java | 156 ++++++++++++++++++ 1 file changed, 156 insertions(+) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 49880b0199b9c..a04233d4651aa 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1533,6 +1533,14 @@ private static long longAndSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1543,6 +1551,14 @@ private static long longOrSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + 
IRNode.XOR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1553,6 +1569,14 @@ private static long longXorSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1563,6 +1587,14 @@ private static long longAddSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMulSimple() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1573,6 +1605,14 @@ private static long longMulSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinSimple() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1583,6 +1623,14 @@ private static long longMinSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + 
IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMaxSimple() { long acc = Long.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1594,6 +1642,14 @@ private static long longMaxSimple() { // ---------long***DotProduct ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAndDotProduct() { long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1604,6 +1660,14 @@ private static long longAndDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrDotProduct() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1614,6 +1678,14 @@ private static long longOrDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorDotProduct() { long acc = 0; // neutral element for 
(int i = 0; i < SIZE; i++) { @@ -1624,6 +1696,14 @@ private static long longXorDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddDotProduct() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1634,6 +1714,14 @@ private static long longAddDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMulDotProduct() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1644,6 +1732,14 @@ private static long longMulDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinDotProduct() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1654,6 +1750,13 @@ private static long longMinDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long 
longMaxDotProduct() { long acc = Long.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1665,6 +1768,13 @@ private static long longMaxDotProduct() { // ---------long***Big ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndBig() { long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1675,6 +1785,14 @@ private static long longAndBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1685,6 +1803,14 @@ private static long longOrBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1695,6 +1821,14 @@ private static long longXorBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = 
IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1705,6 +1839,13 @@ private static long longAddBig() { } @Test + //@IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + // IRNode.MUL_REDUCTION_VL, "> 0", + // IRNode.MUL_VL, "> 0"}, + // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + // applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L) + // TODO: investigate, file report / issue. private static long longMulBig() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1715,6 +1856,14 @@ private static long longMulBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinBig() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1725,6 +1874,13 @@ private static long longMinBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxBig() { long acc = Long.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { From b8251094af5e8275d3e42b23db1ae552cea507d6 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 18:26:22 +0200 Subject: [PATCH 25/39] floating add ir test --- .../loopopts/superword/TestReductions.java | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git 
a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index a04233d4651aa..1e4b54dbdb5e3 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1892,6 +1892,16 @@ private static long longMaxBig() { // ---------float***Simple ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can vectorize. + // Scalar: n loads + n adds + // Vector: n loads + n adds + n extract (sequential order of reduction) private static float floatAddSimple() { float acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1933,6 +1943,13 @@ private static float floatMaxSimple() { // ---------float***DotProduct ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatAddDotProduct() { float acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1974,6 +1991,13 @@ private static float floatMaxDotProduct() { // ---------float***Big ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", 
"asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatAddBig() { float acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2015,6 +2039,16 @@ private static float floatMaxBig() { // ---------double***Simple ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VD, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can vectorize. + // Scalar: n loads + n adds + // Vector: n loads + n adds + n extract (sequential order of reduction) private static double doubleAddSimple() { double acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2056,6 +2090,13 @@ private static double doubleMaxSimple() { // ---------double***DotProduct ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VD, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleAddDotProduct() { double acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2097,6 +2138,13 @@ private static double doubleMaxDotProduct() { // ---------double***Big ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf 
= {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleAddBig() { double acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { From 0de4a974b775b6c5ac4a4432852a0833c317fae3 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 25 Oct 2025 13:15:59 +0200 Subject: [PATCH 26/39] double ir tests --- .../loopopts/superword/TestReductions.java | 154 +++++++++++++++--- 1 file changed, 135 insertions(+), 19 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 1e4b54dbdb5e3..da58bf744474c 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1540,7 +1540,6 @@ private static long longAndSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longOrSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1558,7 +1557,6 @@ private static long longOrSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longXorSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1576,7 +1574,6 @@ private static long longXorSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longAddSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1594,7 +1591,6 @@ private static long longAddSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = 
IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMulSimple() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1612,7 +1608,6 @@ private static long longMulSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMinSimple() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1630,7 +1625,6 @@ private static long longMinSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMaxSimple() { long acc = Long.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1649,7 +1643,6 @@ private static long longMaxSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longAndDotProduct() { long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1667,7 +1660,6 @@ private static long longAndDotProduct() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longOrDotProduct() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1685,7 +1677,6 @@ private static long longOrDotProduct() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longXorDotProduct() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1703,7 +1694,6 @@ private static long longXorDotProduct() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) 
@IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longAddDotProduct() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1721,7 +1711,6 @@ private static long longAddDotProduct() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMulDotProduct() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1739,7 +1728,6 @@ private static long longMulDotProduct() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMinDotProduct() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1792,7 +1780,6 @@ private static long longAndBig() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longOrBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1810,7 +1797,6 @@ private static long longOrBig() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longXorBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1828,7 +1814,6 @@ private static long longXorBig() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longAddBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1863,7 +1848,6 @@ private static long longMulBig() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = 
{"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMinBig() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1912,6 +1896,16 @@ private static float floatAddSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can vectorize. + // Scalar: n loads + n mul + // Vector: n loads + n mul + n extract (sequential order of reduction) private static float floatMulSimple() { float acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1922,6 +1916,13 @@ private static float floatMulSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMinSimple() { float acc = Float.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1932,6 +1933,13 @@ private static float floatMinSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMaxSimple() { float acc = Float.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1960,6 +1968,13 @@ private static float floatAddDotProduct() { } @Test + @IR(counts = 
{IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMulDotProduct() { float acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1970,6 +1985,13 @@ private static float floatMulDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMinDotProduct() { float acc = Float.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1980,6 +2002,13 @@ private static float floatMinDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMaxDotProduct() { float acc = Float.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2008,6 +2037,13 @@ private static float floatAddBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMulBig() { float acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2018,6 +2054,13 
@@ private static float floatMulBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMinBig() { float acc = Float.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2028,6 +2071,13 @@ private static float floatMinBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMaxBig() { float acc = Float.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2039,9 +2089,9 @@ private static float floatMaxBig() { // ---------double***Simple ------------------------------------------------------------ @Test - @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", - IRNode.ADD_REDUCTION_V, "> 0", - IRNode.ADD_VD, "= 0"}, + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.ADD_REDUCTION_VD, "> 0", + IRNode.ADD_VD, "= 0"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) @IR(failOn = IRNode.LOAD_VECTOR_D, @@ -2059,6 +2109,16 @@ private static double doubleAddSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MUL_REDUCTION_VD, "> 0", + IRNode.MUL_VD, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can 
vectorize. + // Scalar: n loads + n mul + // Vector: n loads + n mul + n extract (sequential order of reduction) private static double doubleMulSimple() { double acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2069,6 +2129,13 @@ private static double doubleMulSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMinSimple() { double acc = Double.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2079,6 +2146,13 @@ private static double doubleMinSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMaxSimple() { double acc = Double.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2107,6 +2181,13 @@ private static double doubleAddDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MUL_REDUCTION_VD, "> 0", + IRNode.MUL_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMulDotProduct() { double acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2117,6 +2198,13 @@ private static double doubleMulDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VD, "> 0"}, + applyIfCPUFeatureOr = 
{"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMinDotProduct() { double acc = Double.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2127,6 +2215,13 @@ private static double doubleMinDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMaxDotProduct() { double acc = Double.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2155,6 +2250,13 @@ private static double doubleAddBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MUL_REDUCTION_VD, "> 0", + IRNode.MUL_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMulBig() { double acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2165,6 +2267,13 @@ private static double doubleMulBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMinBig() { double acc = Double.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2175,6 +2284,13 @@ private static double doubleMinBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 
0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMaxBig() { double acc = Double.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { From ff4c1dad6ccea5270e061676d718ff866143aba2 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 08:38:20 +0100 Subject: [PATCH 27/39] AVX2 exception for min/max long --- .../loopopts/superword/TestReductions.java | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index da58bf744474c..cca7b37500769 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1604,8 +1604,11 @@ private static long longMulSimple() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinSimple() { @@ -1621,8 +1624,11 @@ private static long longMinSimple() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxSimple() { @@ -1724,8 +1730,11 @@ private static long longMulDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinDotProduct() { @@ -1741,8 +1750,11 @@ private static long longMinDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxDotProduct() { @@ -1844,8 +1856,11 @@ private static long longMulBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinBig() { @@ -1861,8 +1876,11 @@ private static long longMinBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxBig() { From af8dd438b7373a2bb9c3d967b3927a3781520972 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 08:59:55 +0100 Subject: [PATCH 28/39] avx2 exception for mul long --- .../compiler/loopopts/superword/TestReductions.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index cca7b37500769..d45c4afe91e3e 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1587,8 +1587,11 @@ private static long longAddSimple() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MUL_REDUCTION_VL, "> 0", IRNode.MUL_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512dq", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370673 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMulSimple() { @@ -1713,8 +1716,11 @@ private static long longAddDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MUL_REDUCTION_VL, "> 0", IRNode.MUL_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512dq", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370673 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMulDotProduct() { From d457b0461732965dc5a8b57afe834c746e45da10 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 09:09:28 +0100 Subject: [PATCH 29/39] AVX=0 ir rule adjustments --- .../loopopts/superword/TestReductions.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index d45c4afe91e3e..774abdf339440 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1943,7 +1943,7 @@ private static float floatMulSimple() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -1960,7 +1960,7 @@ private static float floatMinSimple() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2012,7 +2012,7 @@ private static float floatMulDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", 
"> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2029,7 +2029,7 @@ private static float floatMinDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2081,7 +2081,7 @@ private static float floatMulBig() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2098,7 +2098,7 @@ private static float floatMinBig() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2156,7 +2156,7 @@ private static double doubleMulSimple() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2173,7 +2173,7 @@ private static double doubleMinSimple() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VD, "> 
0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2225,7 +2225,7 @@ private static double doubleMulDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2242,7 +2242,7 @@ private static double doubleMinDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2294,7 +2294,7 @@ private static double doubleMulBig() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2311,7 +2311,7 @@ private static double doubleMinBig() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = 
{"AutoVectorizationOverrideProfitability", "= 0"}) From 180d066cc5dc00b8e35aeca526da94af39b2760c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 10:33:43 +0100 Subject: [PATCH 30/39] fix asimd add/mul f/d rules --- .../loopopts/superword/TestReductions.java | 78 +++++++++++++++---- 1 file changed, 63 insertions(+), 15 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 774abdf339440..c0f068df29c9e 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -22,7 +22,7 @@ */ /* - * @test + * @test id=no-vectorization * @bug 8340093 * @summary Test vectorization of reduction loops. * @library /test/lib / @@ -30,7 +30,7 @@ */ /* - * @test + * @test id=vanilla * @bug 8340093 * @summary Test vectorization of reduction loops. * @library /test/lib / @@ -38,7 +38,7 @@ */ /* - * @test + * @test id=force-vectorization * @bug 8340093 * @summary Test vectorization of reduction loops. * @library /test/lib / @@ -1903,8 +1903,12 @@ private static long longMaxBig() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VF, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. 
@@ -1923,8 +1927,12 @@ private static float floatAddSimple() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MUL_REDUCTION_VF, "> 0", IRNode.MUL_VF, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. @@ -1978,8 +1986,12 @@ private static float floatMaxSimple() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VF, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatAddDotProduct() { @@ -1995,8 +2007,12 @@ private static float floatAddDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MUL_REDUCTION_VF, "> 0", IRNode.MUL_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMulDotProduct() { @@ -2047,8 +2063,12 @@ private static float floatMaxDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatAddBig() { @@ -2064,8 +2084,12 @@ private static float floatAddBig() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MUL_REDUCTION_VF, "> 0", IRNode.MUL_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. 
@IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMulBig() { @@ -2116,8 +2140,12 @@ private static float floatMaxBig() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.ADD_REDUCTION_VD, "> 0", IRNode.ADD_VD, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. @@ -2136,8 +2164,12 @@ private static double doubleAddSimple() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MUL_REDUCTION_VD, "> 0", IRNode.MUL_VD, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. 
@@ -2191,8 +2223,12 @@ private static double doubleMaxSimple() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VD, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleAddDotProduct() { @@ -2208,8 +2244,12 @@ private static double doubleAddDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MUL_REDUCTION_VD, "> 0", IRNode.MUL_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMulDotProduct() { @@ -2260,8 +2300,12 @@ private static double doubleMaxDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleAddBig() { @@ -2277,8 +2321,12 @@ private static double doubleAddBig() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MUL_REDUCTION_VD, "> 0", IRNode.MUL_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMulBig() { From c2768d865a1cda9618a1fff4e9d9239bbe9bac09 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 11:03:13 +0100 Subject: [PATCH 31/39] fix some asimd ir rules --- .../loopopts/superword/TestReductions.java | 63 ++++++++++++++++--- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index c0f068df29c9e..8f9cabeaf29d8 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1594,6 +1594,16 @@ private static long longAddSimple() { // I think this could vectorize, but currently does not. Filed: JDK-8370673 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + // TODO: it seems we support this after all on NEON? Investigate! + // Concerning: reduction is done in scalar. But we also have + // a scalar element-wise operation or MulVL ... 
but it is not + // recommended that it is used, see: + // Matcher::match_rule_supported_auto_vectorization + // This probably explains the slowdown we see in the benchmark! + // We should thus also revise all other occurances of MulVL. + // Maybe we also have to fix the code that moves the reduction + // out of the loop, because it seems to introduce the MulVL, + // but probably should not. private static long longMulSimple() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1648,8 +1658,11 @@ private static long longMaxSimple() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.AND_REDUCTION_V, "> 0", IRNode.AND_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndDotProduct() { @@ -1665,8 +1678,11 @@ private static long longAndDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.OR_REDUCTION_V, "> 0", IRNode.OR_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. 
@IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longOrDotProduct() { @@ -1684,6 +1700,9 @@ private static long longOrDotProduct() { IRNode.XOR_VL, "> 0"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longXorDotProduct() { @@ -1716,11 +1735,14 @@ private static long longAddDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MUL_REDUCTION_VL, "> 0", IRNode.MUL_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512dq", "true", "asimd", "true"}, + applyIfCPUFeature = {"avx512dq", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) // I think this could vectorize, but currently does not. Filed: JDK-8370673 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // MulVL is not implemented on NEON, so we also not have the reduction. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMulDotProduct() { @@ -1736,11 +1758,14 @@ private static long longMulDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIfCPUFeature = {"avx512", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMinSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinDotProduct() { @@ -1756,11 +1781,14 @@ private static long longMinDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIfCPUFeature = {"avx512", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxDotProduct() { @@ -1777,8 +1805,11 @@ private static long longMaxDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.AND_REDUCTION_V, "> 0", IRNode.AND_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. 
@IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndBig() { @@ -1794,8 +1825,11 @@ private static long longAndBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.OR_REDUCTION_V, "> 0", IRNode.OR_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longOrBig() { @@ -1811,8 +1845,11 @@ private static long longOrBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.XOR_REDUCTION_V, "> 0", IRNode.XOR_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longXorBig() { @@ -1862,11 +1899,14 @@ private static long longMulBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIfCPUFeature = {"avx512", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMinSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinBig() { @@ -1882,11 +1922,14 @@ private static long longMinBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIfCPUFeature = {"avx512", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxBig() { From 1edb758ba6820bccae887b1e020cf4988d24265c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 13:27:23 +0100 Subject: [PATCH 32/39] fix ir test a bit more --- .../loopopts/superword/TestReductions.java | 67 +++++++++++++------ 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 8f9cabeaf29d8..9cf9dc1e25b2c 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -105,7 +105,8 @@ public static void main(String[] args) { switch (args[0]) { case "P0" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=0"); } case "P1" -> { 
framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=1"); } - case "P2" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2"); } + // Note: increasing the node count limit also helps in some cases. + case "P2" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2", "-XX:LoopUnrollLimit=1000"); } default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); } }; framework.start(); @@ -1594,16 +1595,9 @@ private static long longAddSimple() { // I think this could vectorize, but currently does not. Filed: JDK-8370673 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - // TODO: it seems we support this after all on NEON? Investigate! - // Concerning: reduction is done in scalar. But we also have - // a scalar element-wise operation or MulVL ... but it is not - // recommended that it is used, see: - // Matcher::match_rule_supported_auto_vectorization - // This probably explains the slowdown we see in the benchmark! - // We should thus also revise all other occurances of MulVL. - // Maybe we also have to fix the code that moves the reduction - // out of the loop, because it seems to introduce the MulVL, - // but probably should not. + // Note: we get a performance regression for NEON, because it uses a + // scalar implementation for the reduction. + // Filed: JDK-8370686 private static long longMulSimple() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1663,6 +1657,7 @@ private static long longMaxSimple() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. 
+ // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndDotProduct() { @@ -1683,6 +1678,7 @@ private static long longAndDotProduct() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longOrDotProduct() { @@ -1698,11 +1694,12 @@ private static long longOrDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.XOR_REDUCTION_V, "> 0", IRNode.XOR_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longXorDotProduct() { @@ -1718,8 +1715,12 @@ private static long longXorDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.ADD_REDUCTION_VL, "> 0", IRNode.ADD_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longAddSimple), MulVL is not. 
+ // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAddDotProduct() { @@ -1743,6 +1744,7 @@ private static long longAddDotProduct() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // MulVL is not implemented on NEON, so we also not have the reduction. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMulDotProduct() { @@ -1766,6 +1768,7 @@ private static long longMulDotProduct() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longMinSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinDotProduct() { @@ -1789,6 +1792,7 @@ private static long longMinDotProduct() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxDotProduct() { @@ -1810,6 +1814,7 @@ private static long longMaxDotProduct() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndBig() { @@ -1830,6 +1835,7 @@ private static long longAndBig() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. 
+ // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longOrBig() { @@ -1850,6 +1856,7 @@ private static long longOrBig() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longXorBig() { @@ -1865,8 +1872,12 @@ private static long longXorBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.ADD_REDUCTION_VL, "> 0", IRNode.ADD_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longAddSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAddBig() { @@ -1879,13 +1890,25 @@ private static long longAddBig() { } @Test - //@IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", - // IRNode.MUL_REDUCTION_VL, "> 0", - // IRNode.MUL_VL, "> 0"}, - // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, - // applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) - @IR(failOn = IRNode.LOAD_VECTOR_L) - // TODO: investigate, file report / issue. 
+ @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, + applyIfCPUFeature = {"avx512dq", "true"}, + applyIfAnd = {"AutoVectorizationOverrideProfitability", "> 0", + "LoopUnrollLimit", ">= 1000"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeature = {"avx512dq", "true"}, + applyIfAnd = {"AutoVectorizationOverrideProfitability", "> 0", + "LoopUnrollLimit", "< 1000"}) + // Increasing the body limit seems to help. Filed for investigation: JDK-8370685 + // If you can eliminate this exception for LoopUnrollLimit, please remove + // the flag completely from the test, also the "addFlags" at the top. + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // MulVL is not implemented on NEON, so we also not have the reduction. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMulBig() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1907,6 +1930,7 @@ private static long longMulBig() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longMinSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinBig() { @@ -1930,6 +1954,7 @@ private static long longMinBig() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not. 
+ // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxBig() { From 18a88983996cc47942f6e5c563093c865259901a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 29 Oct 2025 11:40:53 +0100 Subject: [PATCH 33/39] fix aarch64 long mul reduction perf issue --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 14 ++++++++++---- src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 14 ++++++++++---- src/hotspot/share/opto/vtransform.cpp | 2 +- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 3379041b2ccac..9809d096233a3 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -129,18 +129,24 @@ source %{ bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) { if (UseSVE == 0) { // These operations are not profitable to be vectorized on NEON, because no direct - // NEON instructions support them. But the match rule support for them is profitable for - // Vector API intrinsics. + // NEON instructions support them. They use multiple instructions which is more + // expensive in almost all cases where we would auto vectorize. + // But the match rule support for them is profitable for Vector API intrinsics. if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) || (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || + opcode == Op_MulVL || // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. // They are not suitable for auto-vectorization because the result would not conform // to the JLS, Section Evaluation Order. 
+ // Note: we could implement sequential reductions for these reduction operators, but + // this will still almost never lead to speedups, because the sequential + // reductions are latency limited along the reduction chain, and not + // throughput limited. This is unlike unordered reductions (associative op) + // and element-wise ops which are usually throughput limited. opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || - opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || - opcode == Op_MulVL) { + opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { return false; } } diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 6d296cbdb3ac3..a9f42e1bc08c9 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -119,18 +119,24 @@ source %{ bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) { if (UseSVE == 0) { // These operations are not profitable to be vectorized on NEON, because no direct - // NEON instructions support them. But the match rule support for them is profitable for - // Vector API intrinsics. + // NEON instructions support them. They use multiple instructions which is more + // expensive in almost all cases where we would auto vectorize. + // But the match rule support for them is profitable for Vector API intrinsics. if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) || (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || + opcode == Op_MulVL || // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. // They are not suitable for auto-vectorization because the result would not conform // to the JLS, Section Evaluation Order. 
+ // Note: we could implement sequential reductions for these reduction operators, but + // this will still almost never lead to speedups, because the sequential + // reductions are latency limited along the reduction chain, and not + // throughput limited. This is unlike unordered reductions (associative op) + // and element-wise ops which are usually throughput limited. opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || - opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || - opcode == Op_MulVL) { + opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { return false; } } diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 954a915cbaea4..c245206b609d5 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -1242,7 +1242,7 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou const BasicType bt = element_basic_type(); const int ropc = vector_reduction_opcode(); const int vopc = VectorNode::opcode(sopc, bt); - if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) { + if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) { DEBUG_ONLY( this->print(); ) assert(false, "do not have normal vector op for this reduction"); return false; // not implemented From 2bd9c94dd7d30a571e6972a8dc51e59022085b6d Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 29 Oct 2025 13:44:48 +0100 Subject: [PATCH 34/39] rm assert --- src/hotspot/share/opto/vtransform.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index c245206b609d5..2cfca67e3f697 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -1243,9 +1243,9 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou const int ropc = vector_reduction_opcode(); const int vopc = 
VectorNode::opcode(sopc, bt); if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) { - DEBUG_ONLY( this->print(); ) - assert(false, "do not have normal vector op for this reduction"); - return false; // not implemented + // The element-wise vector operation needed for the vector accumulator + // is not implemented / supported. + return false; } // Traverse up the chain of non strict order reductions, checking that it loops From a8d31d756ea378179044b4f29ef302ba6852f4e3 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 29 Oct 2025 14:55:52 +0100 Subject: [PATCH 35/39] fix IR rules for aarch64 NEON --- .../loopopts/superword/TestReductions.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 9cf9dc1e25b2c..1cd5cfa1e750c 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1587,17 +1587,25 @@ private static long longAddSimple() { @Test @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MUL_REDUCTION_VL, "> 0", - IRNode.MUL_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512dq", "true", "asimd", "true"}, + IRNode.MUL_VL, "> 0"}, // vector accumulator + applyIfCPUFeature = {"avx512dq", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) // I think this could vectorize, but currently does not. Filed: JDK-8370673 + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "= 0"}, // Reduction NOT moved out of loop + applyIfCPUFeatureOr = {"asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + // Note: NEON does not support MulVL for auto vectorization. 
There is + // a scalarized implementation, but that is not profitable for + // auto vectorization in almost all cases, and would not be + // profitable here at any rate. + // Hence, we have to keep the reduction inside the loop, and + // cannot use the MulVL as the vector accumulator. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - // Note: we get a performance regression for NEON, because it uses a - // scalar implementation for the reduction. - // Filed: JDK-8370686 private static long longMulSimple() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { From 3f7ef58ef15f1eac11dbcff85d5fbc830ecd715e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 29 Oct 2025 15:09:10 +0100 Subject: [PATCH 36/39] simplify cost-model impl --- src/hotspot/share/opto/matcher.cpp | 27 ------------------------ src/hotspot/share/opto/matcher.hpp | 5 ----- src/hotspot/share/opto/vectorization.cpp | 17 ++++++++++++--- 3 files changed, 14 insertions(+), 35 deletions(-) diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index 3d090210de517..c63cefe7ac201 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2678,33 +2678,6 @@ void Matcher::specialize_generic_vector_operands() { } } -// For now, we use unit cost. We might refine that in the future. -// If needed, we could also use platform specific costs, if the -// default here is not accurate enough. -float Matcher::cost_for_scalar(int opcode) { - return 1; -} - -// For now, we use unit cost. We might refine that in the future. -// If needed, we could also use platform specific costs, if the -// default here is not accurate enough. -float Matcher::cost_for_vector(int opcode, int vlen, BasicType bt) { - return 1; -} - -// For now, we use unit cost. We might refine that in the future. -// If needed, we could also use platform specific costs, if the -// default here is not accurate enough. 
-float Matcher::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) { - if (requires_strict_order) { - // Linear: shuffle and reduce - return 2 * vlen; - } else { - // Recursive: shuffle and reduce - return 2 * exact_log2(vlen); - } -} - uint Matcher::vector_length(const Node* n) { const TypeVect* vt = n->bottom_type()->is_vect(); return vt->length(); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index 42e75e6db0182..e4396b423ac0e 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -333,11 +333,6 @@ class Matcher : public PhaseTransform { static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen); - // Cost-Model for Auto-Vectorization - static float cost_for_scalar(int opcode); - static float cost_for_vector(int opcode, int vlen, BasicType bt); - static float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order); - static const RegMask* predicate_reg_mask(void); // Vector width in bytes diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 15e4248cf409a..d071781e11e9e 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -598,8 +598,11 @@ float VLoopAnalyzer::cost() const { return sum; } +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. float VLoopAnalyzer::cost_for_scalar(int opcode) const { - float c = Matcher::cost_for_scalar(opcode); + float c = 1; #ifndef PRODUCT if (_vloop.is_trace_cost()) { tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); @@ -608,8 +611,11 @@ float VLoopAnalyzer::cost_for_scalar(int opcode) const { return c; } +// For now, we use unit cost. We might refine that in the future. 
+// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { - float c = Matcher::cost_for_vector(opcode, vlen, bt); + float c = 1; #ifndef PRODUCT if (_vloop.is_trace_cost()) { tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", @@ -619,8 +625,13 @@ float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { return c; } +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { - float c = Matcher::cost_for_vector_reduction(opcode, vlen, bt, requires_strict_order); + // Each reduction is composed of multiple instructions, each estimated with a unit cost. + // Linear (strict order): a shuffle and a reduce per element (2 * vlen); Recursive (non-strict): a shuffle and a reduce per lane-halving step (2 * log2(vlen)). + float c = requires_strict_order ?
2 * vlen : 2 * exact_log2(vlen); #ifndef PRODUCT if (_vloop.is_trace_cost()) { tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", From 22dab5a4d0a07378b0718dd67e5e470cc37372e9 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 3 Nov 2025 14:56:06 +0100 Subject: [PATCH 37/39] Update src/hotspot/share/opto/vectorization.cpp Co-authored-by: Hannes Greule --- src/hotspot/share/opto/vectorization.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index d071781e11e9e..e231515166173 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -541,7 +541,7 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -// Cost-model heuristic for nodes that do not contribute to computatinal +// Cost-model heuristic for nodes that do not contribute to computational // cost inside the loop. bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Outside body? From d79df4fce4c0464bb97b7b66a5ee490a832f6230 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 3 Nov 2025 17:09:21 +0100 Subject: [PATCH 38/39] More comments for SirYwell --- src/hotspot/share/opto/vectorization.cpp | 3 ++- src/hotspot/share/opto/vtransform.cpp | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index e231515166173..ef83358719657 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -625,7 +625,8 @@ float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { return c; } -// For now, we use unit cost. We might refine that in the future. +// For now, we use unit cost, i.e. we count the number of backend instructions +// that the vtnode will use. We might refine that in the future. 
// If needed, we could also use platform specific costs, if the // default here is not accurate enough. float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 2cfca67e3f697..379768d8172bf 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -249,6 +249,10 @@ float VTransformGraph::cost() const { } #endif + // We only want to count the cost of nodes that are in the loop. + // This is especially important for cases where we were able to move + // some nodes outside the loop during VTransform::optimize, e.g.: + // VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop ResourceMark rm; VectorSet in_loop; // vtn->_idx -> bool mark_vtnodes_in_loop(in_loop); From 23906b814a01d910c089eb06e0733a5222fb4eb1 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 5 Nov 2025 10:45:28 +0100 Subject: [PATCH 39/39] rename cost methods for Vladimir K --- src/hotspot/share/opto/superword.cpp | 4 +-- src/hotspot/share/opto/vectorization.cpp | 12 ++++----- src/hotspot/share/opto/vectorization.hpp | 8 +++--- src/hotspot/share/opto/vtransform.cpp | 34 ++++++++++++------------ src/hotspot/share/opto/vtransform.hpp | 4 +-- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 4af48667c3bc7..dfac5240b504f 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1930,8 +1930,8 @@ bool VTransform::is_profitable() const { if (has_store_to_load_forwarding_failure()) { return false; } // Cost-model - float scalar_cost = _vloop_analyzer.cost(); - float vector_cost = cost(); + float scalar_cost = _vloop_analyzer.cost_for_scalar_loop(); + float vector_cost = cost_for_vector_loop(); #ifndef PRODUCT if (_trace._info) { 
tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f", diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index ef83358719657..98f3d79c9f5ce 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -569,10 +569,10 @@ bool VLoopAnalyzer::has_zero_cost(Node* n) const { } // Compute the cost over all operations in the (scalar) loop. -float VLoopAnalyzer::cost() const { +float VLoopAnalyzer::cost_for_scalar_loop() const { #ifndef PRODUCT if (_vloop.is_trace_cost()) { - tty->print_cr("\nVLoopAnalyzer::cost:"); + tty->print_cr("\nVLoopAnalyzer::cost_for_scalar_loop:"); } #endif @@ -580,7 +580,7 @@ float VLoopAnalyzer::cost() const { for (int j = 0; j < body().body().length(); j++) { Node* n = body().body().at(j); if (!has_zero_cost(n)) { - float c = cost_for_scalar(n->Opcode()); + float c = cost_for_scalar_node(n->Opcode()); sum += c; #ifndef PRODUCT if (_vloop.is_trace_cost_verbose()) { @@ -601,7 +601,7 @@ float VLoopAnalyzer::cost() const { // For now, we use unit cost. We might refine that in the future. // If needed, we could also use platform specific costs, if the // default here is not accurate enough. -float VLoopAnalyzer::cost_for_scalar(int opcode) const { +float VLoopAnalyzer::cost_for_scalar_node(int opcode) const { float c = 1; #ifndef PRODUCT if (_vloop.is_trace_cost()) { @@ -614,7 +614,7 @@ float VLoopAnalyzer::cost_for_scalar(int opcode) const { // For now, we use unit cost. We might refine that in the future. // If needed, we could also use platform specific costs, if the // default here is not accurate enough. 
-float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { +float VLoopAnalyzer::cost_for_vector_node(int opcode, int vlen, BasicType bt) const { float c = 1; #ifndef PRODUCT if (_vloop.is_trace_cost()) { @@ -629,7 +629,7 @@ float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { // that the vtnode will use. We might refine that in the future. // If needed, we could also use platform specific costs, if the // default here is not accurate enough. -float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { +float VLoopAnalyzer::cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { // Each reduction is composed of multiple instructions, each estimated with a unit cost. // Linear: shuffle and reduce Recursive: shuffle and reduce float c = requires_strict_order ? 2 * vlen : 2 * exact_log2(vlen); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 419c29d6544a5..f7099b5b7c0a4 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -853,13 +853,13 @@ class VLoopAnalyzer : StackObj { const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; } // Compute the cost of the (scalar) body. - float cost() const; + float cost_for_scalar_loop() const; bool has_zero_cost(Node* n) const; // Cost-modeling with tracing. 
- float cost_for_scalar(int opcode) const; - float cost_for_vector(int opcode, int vlen, BasicType bt) const; - float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; + float cost_for_scalar_node(int opcode) const; + float cost_for_vector_node(int opcode, int vlen, BasicType bt) const; + float cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; private: bool setup_submodules(); diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 379768d8172bf..9fd6ad1089c55 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -241,11 +241,11 @@ void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { } } -float VTransformGraph::cost() const { +float VTransformGraph::cost_for_vector_loop() const { assert(is_scheduled(), "must already be scheduled"); #ifndef PRODUCT if (_vloop.is_trace_cost()) { - tty->print_cr("\nVTransformGraph::cost:"); + tty->print_cr("\nVTransformGraph::cost_for_vector_loop:"); } #endif @@ -927,7 +927,7 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { // This is an identity transform, but loads and stores must be counted. 
assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted"); - return vloop_analyzer.cost_for_scalar(_node->Opcode()); + return vloop_analyzer.cost_for_scalar_node(_node->Opcode()); } VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const { @@ -948,7 +948,7 @@ float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const if (vloop_analyzer.has_zero_cost(_node)) { return 0; } else { - return vloop_analyzer.cost_for_scalar(_node->Opcode()); + return vloop_analyzer.cost_for_scalar_node(_node->Opcode()); } } @@ -1005,7 +1005,7 @@ VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_sta } float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return vloop_analyzer.cost_for_vector(Op_Replicate, _vlen, _element_type); + return vloop_analyzer.cost_for_vector_node(Op_Replicate, _vlen, _element_type); } VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const { @@ -1016,7 +1016,7 @@ VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply } float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return vloop_analyzer.cost_for_scalar(Op_ConvI2L); + return vloop_analyzer.cost_for_scalar_node(Op_ConvI2L); } VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const { @@ -1028,8 +1028,8 @@ VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_s float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const { int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode); - return vloop_analyzer.cost_for_scalar(Op_AndI) + - vloop_analyzer.cost_for_vector(shift_count_opc, _vlen, _element_bt); + return vloop_analyzer.cost_for_scalar_node(Op_AndI) + + vloop_analyzer.cost_for_vector_node(shift_count_opc, _vlen, _element_bt); } VTransformApplyResult 
VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const { @@ -1048,7 +1048,7 @@ VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& appl } float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return vloop_analyzer.cost_for_vector(Op_PopulateIndex, _vlen, _element_bt); + return vloop_analyzer.cost_for_vector_node(Op_PopulateIndex, _vlen, _element_bt); } VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const { @@ -1063,7 +1063,7 @@ VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& a } float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()); + return vloop_analyzer.cost_for_vector_node(_vector_opcode, vector_length(), element_basic_type()); } VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1086,8 +1086,8 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type()); - return vloop_analyzer.cost_for_vector(vopc, vector_length(), element_basic_type()) + - vloop_analyzer.cost_for_vector(Op_VectorCastL2X, vector_length(), T_INT); + return vloop_analyzer.cost_for_vector_node(vopc, vector_length(), element_basic_type()) + + vloop_analyzer.cost_for_vector_node(Op_VectorCastL2X, vector_length(), T_INT); } VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1106,7 +1106,7 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply( } float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return 
vloop_analyzer.cost_for_vector(Op_VectorReinterpret, vector_length(), element_basic_type()); + return vloop_analyzer.cost_for_vector_node(Op_VectorReinterpret, vector_length(), element_basic_type()); } VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1123,7 +1123,7 @@ VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyStat float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { assert(scalar_opcode() == Op_Bool, ""); - return vloop_analyzer.cost_for_vector(Op_VectorMaskCmp, vector_length(), element_basic_type()); + return vloop_analyzer.cost_for_vector_node(Op_VectorMaskCmp, vector_length(), element_basic_type()); } VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1386,7 +1386,7 @@ float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) c BasicType bt = element_basic_type(); int vopc = vector_reduction_opcode(); bool requires_strict_order = ReductionNode::auto_vectorization_requires_strict_order(vopc); - return vloop_analyzer.cost_for_vector_reduction(vopc, vlen, bt, requires_strict_order); + return vloop_analyzer.cost_for_vector_reduction_node(vopc, vlen, bt, requires_strict_order); } VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1401,7 +1401,7 @@ VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { uint vlen = vector_length(); BasicType bt = element_basic_type(); - return vloop_analyzer.cost_for_vector(Op_LoadVector, vlen, bt); + return vloop_analyzer.cost_for_vector_node(Op_LoadVector, vlen, bt); } VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1436,7 +1436,7 @@ VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& appl float 
VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { uint vlen = vector_length(); BasicType bt = element_basic_type(); - return vloop_analyzer.cost_for_vector(Op_StoreVector, vlen, bt); + return vloop_analyzer.cost_for_vector_node(Op_StoreVector, vlen, bt); } VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const { diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index a887300806ce9..a30f0ff098faf 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -194,7 +194,7 @@ class VTransformGraph : public StackObj { void optimize(VTransform& vtransform); bool schedule(); bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const; - float cost() const; + float cost_for_vector_loop() const; void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const; private: @@ -259,7 +259,7 @@ class VTransform : public StackObj { void optimize() { return _graph.optimize(*this); } bool schedule() { return _graph.schedule(); } bool is_profitable() const; - float cost() const { return _graph.cost(); } + float cost_for_vector_loop() const { return _graph.cost_for_vector_loop(); } bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); } void apply();