diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 3379041b2ccac..9809d096233a3 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -129,18 +129,24 @@ source %{ bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) { if (UseSVE == 0) { // These operations are not profitable to be vectorized on NEON, because no direct - // NEON instructions support them. But the match rule support for them is profitable for - // Vector API intrinsics. + // NEON instructions support them. They use multiple instructions which is more + // expensive in almost all cases where we would auto vectorize. + // But the match rule support for them is profitable for Vector API intrinsics. if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) || (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || + opcode == Op_MulVL || // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. // They are not suitable for auto-vectorization because the result would not conform // to the JLS, Section Evaluation Order. + // Note: we could implement sequential reductions for these reduction operators, but + // this will still almost never lead to speedups, because the sequential + // reductions are latency limited along the reduction chain, and not + // throughput limited. This is unlike unordered reductions (associative op) + // and element-wise ops which are usually throughput limited. opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || - opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || - opcode == Op_MulVL) { + opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { return false; } } diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 6d296cbdb3ac3..a9f42e1bc08c9 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -119,18 +119,24 @@ source %{ bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) { if (UseSVE == 0) { // These operations are not profitable to be vectorized on NEON, because no direct - // NEON instructions support them. But the match rule support for them is profitable for - // Vector API intrinsics. + // NEON instructions support them. They use multiple instructions which is more + // expensive in almost all cases where we would auto vectorize. + // But the match rule support for them is profitable for Vector API intrinsics. if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) || (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || + opcode == Op_MulVL || // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. // They are not suitable for auto-vectorization because the result would not conform // to the JLS, Section Evaluation Order. + // Note: we could implement sequential reductions for these reduction operators, but + // this will still almost never lead to speedups, because the sequential + // reductions are latency limited along the reduction chain, and not + // throughput limited. This is unlike unordered reductions (associative op) + // and element-wise ops which are usually throughput limited. 
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || - opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || - opcode == Op_MulVL) { + opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { return false; } } diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 6ab1ff37de9fd..dfac5240b504f 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -42,9 +42,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : ), _vpointer_for_main_loop_alignment(nullptr), _aw_for_main_loop_alignment(0), - _do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style - _num_work_vecs(0), // amount of vector work we have - _num_reductions(0) // amount of reduction work we have + _do_vector_loop(phase()->C->do_vector_loop()) // whether to do vectorization/simd style { } @@ -1567,18 +1565,6 @@ void SuperWord::filter_packs_for_implemented() { // Remove packs that are not profitable. void SuperWord::filter_packs_for_profitable() { - // Count the number of reductions vs other vector ops, for the - // reduction profitability heuristic. - for (int i = 0; i < _packset.length(); i++) { - Node_List* pack = _packset.at(i); - Node* n = pack->at(0); - if (is_marked_reduction(n)) { - _num_reductions++; - } else { - _num_work_vecs++; - } - } - // Remove packs that are not profitable auto filter = [&](const Node_List* pack) { return profitable(pack); @@ -1595,31 +1581,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const { if (p0 != nullptr) { int opc = p0->Opcode(); if (is_marked_reduction(p0)) { - const Type *arith_type = p0->bottom_type(); - // This heuristic predicts that 2-element reductions for INT/LONG are not - // profitable. This heuristic was added in JDK-8078563. The argument - // was that reductions are not just a single instruction, but multiple, and - // hence it is not directly clear that they are profitable. If we only have - // two elements per vector, then the performance gains from non-reduction - // vectors are at most going from 2 scalar instructions to 1 vector instruction. - // But a 2-element reduction vector goes from 2 scalar instructions to - // 3 instructions (1 shuffle and two reduction ops). - // However, this optimization assumes that these reductions stay in the loop - // which may not be true any more in most cases after the introduction of: - // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop - // Hence, this heuristic has room for improvement. - bool is_two_element_int_or_long_reduction = (size == 2) && - (arith_type->basic_type() == T_INT || - arith_type->basic_type() == T_LONG); - if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) { -#ifndef PRODUCT - if (is_trace_superword_rejections()) { - tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable."); - tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); - } -#endif - return false; - } + const Type* arith_type = p0->bottom_type(); retValue = ReductionNode::implemented(opc, size, arith_type->basic_type()); } else if (VectorNode::is_convert_opcode(opc)) { retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0)); @@ -1772,26 +1734,6 @@ bool SuperWord::profitable(const Node_List* p) const { // The second input has to be the vector we wanted to reduce, // but it was not packed. 
return false; - } else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) { - // This heuristic predicts that the reduction is not profitable. - // Reduction vectors can be expensive, because they require multiple - // operations to fold all the lanes together. Hence, vectorizing the - // reduction is not profitable on its own. Hence, we need a lot of - // other "work vectors" that deliver performance improvements to - // balance out the performance loss due to reductions. - // This heuristic is a bit simplistic, and assumes that the reduction - // vector stays in the loop. But in some cases, we can move the - // reduction out of the loop, replacing it with a single vector op. - // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop - // Hence, this heuristic has room for improvement. -#ifndef PRODUCT - if (is_trace_superword_rejections()) { - tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make"); - tty->print_cr(" reduction profitable."); - tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); - } -#endif - return false; } else if (second_pk->size() != p->size()) { return false; } @@ -1950,19 +1892,53 @@ bool SuperWord::do_vtransform() const { vtransform.optimize(); if (!vtransform.schedule()) { return false; } - if (vtransform.has_store_to_load_forwarding_failure()) { return false; } + + if (!vtransform.is_profitable()) { return false; } + + vtransform.apply(); + return true; +} + +// Check Cost-Model, and other heuristics. +// Can be overridden with AutoVectorizationOverrideProfitability. +bool VTransform::is_profitable() const { + assert(_graph.is_scheduled(), "must already be scheduled"); if (AutoVectorizationOverrideProfitability == 0) { #ifndef PRODUCT - if (is_trace_superword_any()) { + if (_trace._info) { tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0)."); } #endif return false; } - vtransform.apply(); - return true; + if (AutoVectorizationOverrideProfitability == 2) { +#ifndef PRODUCT + if (_trace._info) { + tty->print_cr("\nForced vectorization, ignoring profitability (AutoVectorizationOverrideProfitability=2)."); + } +#endif + return true; + } + + // Note: currently we only do throughput-based cost-modeling. In the future, we could + // also implement latency-based cost-modeling and take store-to-load-forwarding + // failures into account as the latency between the load and store. This would + // allow a more precise tradeoff between the forwarding failure penalty versus + // the vectorization gains. + if (has_store_to_load_forwarding_failure()) { return false; } + + // Cost-model + float scalar_cost = _vloop_analyzer.cost_for_scalar_loop(); + float vector_cost = cost_for_vector_loop(); +#ifndef PRODUCT + if (_trace._info) { + tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f", + scalar_cost, vector_cost); + } +#endif + return vector_cost < scalar_cost; } // Apply the vectorization, i.e. we irreversibly edit the C2 graph. 
At this point, all diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 118e0aa042c79..9654465220b9c 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -549,8 +549,6 @@ class SuperWord : public ResourceObj { private: bool _do_vector_loop; // whether to do vectorization/simd style - int _num_work_vecs; // Number of non memory vector operations - int _num_reductions; // Number of reduction expressions applied // Accessors Arena* arena() { return &_arena; } diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index d996173aeb43b..4f67aff9b0706 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -38,7 +38,7 @@ flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ flags(BODY, "Trace VLoopBody") \ flags(TYPES, "Trace VLoopTypes") \ - flags(POINTERS, "Trace VLoopPointers") \ + flags(POINTERS, "Trace VLoopVPointers") \ flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \ flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \ flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \ @@ -47,6 +47,8 @@ flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \ flags(VTRANSFORM, "Trace VTransform Graph") \ flags(OPTIMIZATION, "Trace VTransform::optimize") \ + flags(COST, "Trace cost of VLoop (scalar) and VTransform (vector)") \ + flags(COST_VERBOSE, "Trace like COST, but more verbose") \ flags(ALIGN_VECTOR, "Trace AlignVector") \ flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \ flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \ diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 5c4e15fdbb916..98f3d79c9f5ce 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -287,7 +287,7 @@ void VLoopVPointers::compute_and_cache_vpointers() { int pointers_idx = 0; _body.for_each_mem([&] (MemNode* const mem, int bb_idx) { // Placement new: construct directly into the array. - ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop); + ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, _pointer_expression_nodes); _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); pointers_idx++; }); @@ -541,6 +541,108 @@ void VLoopDependencyGraph::PredsIterator::next() { } } +// Cost-model heuristic for nodes that do not contribute to computational +// cost inside the loop. +bool VLoopAnalyzer::has_zero_cost(Node* n) const { + // Outside body? + if (!_vloop.in_bb(n)) { return true; } + + // Internal nodes of pointer expressions are most likely folded into + // the load / store and have no additional cost. + if (vpointers().is_in_pointer_expression(n)) { return true; } + + // Not all AddP nodes can be detected in VPointer parsing, so + // we filter them out here. + // We don't want to explicitly model the cost of control flow, + // since we have the same CFG structure before and after + // vectorization: A loop head, a loop exit, with a backedge. + if (n->is_AddP() || // Pointer expression + n->is_CFG() || // CFG + n->is_Phi() || // CFG + n->is_Cmp() || // CFG + n->is_Bool()) { // CFG + return true; + } + + // All other nodes have a non-zero cost. + return false; +} + +// Compute the cost over all operations in the (scalar) loop. 
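+// Nodes for which has_zero_cost(n) holds (nodes outside the body, pointer expression internals, AddP, CFG, Phi, Cmp, Bool) are skipped; every remaining node contributes the (currently unit) cost returned by cost_for_scalar_node.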
+float VLoopAnalyzer::cost_for_scalar_loop() const { +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr("\nVLoopAnalyzer::cost_for_scalar_loop:"); + } +#endif + + float sum = 0; + for (int j = 0; j < body().body().length(); j++) { + Node* n = body().body().at(j); + if (!has_zero_cost(n)) { + float c = cost_for_scalar_node(n->Opcode()); + sum += c; +#ifndef PRODUCT + if (_vloop.is_trace_cost_verbose()) { + tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name()); + } +#endif + } + } + +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" total_cost = %.2f", sum); + } +#endif + return sum; +} + +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. +float VLoopAnalyzer::cost_for_scalar_node(int opcode) const { + float c = 1; +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); + } +#endif + return c; +} + +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. +float VLoopAnalyzer::cost_for_vector_node(int opcode, int vlen, BasicType bt) const { + float c = 1; +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", + c, NodeClassNames[opcode], vlen, type2name(bt)); + } +#endif + return c; +} + +// For now, we use unit cost, i.e. we count the number of backend instructions +// that the vtnode will use. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. +float VLoopAnalyzer::cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { + // Each reduction is composed of multiple instructions, each estimated with a unit cost. + // Linear: shuffle and reduce Recursive: shuffle and reduce + float c = requires_strict_order ? 2 * vlen : 2 * exact_log2(vlen); +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", + c, NodeClassNames[opcode], vlen, type2name(bt), + requires_strict_order ? "true" : "false"); + } +#endif + return c; +} + // Computing aliasing runtime check using init and last of main-loop // ----------------------------------------------------------------- // diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index b1be52d531a51..f7099b5b7c0a4 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -209,6 +209,14 @@ class VLoop : public StackObj { return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION); } + bool is_trace_cost() const { + return _vtrace.is_trace(TraceAutoVectorizationTag::COST); + } + + bool is_trace_cost_verbose() const { + return _vtrace.is_trace(TraceAutoVectorizationTag::COST_VERBOSE); + } + bool is_trace_speculative_runtime_checks() const { return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS); } @@ -584,6 +592,32 @@ class VLoopTypes : public StackObj { const Type* container_type(Node* n) const; }; +// Mark all nodes from the loop that are part of any VPointer expression. 
+class PointerExpressionNodes : public MemPointerParserCallback { +private: + const VLoop& _vloop; + const VLoopBody& _body; + VectorSet _in_pointer_expression; + +public: + PointerExpressionNodes(Arena* arena, + const VLoop& vloop, + const VLoopBody& body) : + _vloop(vloop), + _body(body), + _in_pointer_expression(arena) {} + + virtual void callback(Node* n) override { + if (!_vloop.in_bb(n)) { return; } + _in_pointer_expression.set(_body.bb_idx(n)); + } + + bool contains(const Node* n) const { + if (!_vloop.in_bb(n)) { return false; } + return _in_pointer_expression.test(_body.bb_idx(n)); + } +}; + // Submodule of VLoopAnalyzer. // We compute and cache the VPointer for every load and store. class VLoopVPointers : public StackObj { @@ -599,6 +633,9 @@ class VLoopVPointers : public StackObj { // Map bb_idx -> index in _vpointers. -1 if not mapped. GrowableArray<int> _bb_idx_to_vpointer; + // Mark all nodes that are part of any pointer expression. + PointerExpressionNodes _pointer_expression_nodes; + public: VLoopVPointers(Arena* arena, const VLoop& vloop, @@ -610,13 +647,18 @@ class VLoopVPointers : public StackObj { _bb_idx_to_vpointer(arena, vloop.estimated_body_length(), vloop.estimated_body_length(), - -1) {} + -1), + _pointer_expression_nodes(arena, _vloop, _body) {} NONCOPYABLE(VLoopVPointers); void compute_vpointers(); const VPointer& vpointer(const MemNode* mem) const; NOT_PRODUCT( void print() const; ) + bool is_in_pointer_expression(const Node* n) const { + return _pointer_expression_nodes.contains(n); + } + private: void count_vpointers(); void allocate_vpointers_array(); @@ -810,6 +852,15 @@ class VLoopAnalyzer : StackObj { const VLoopVPointers& vpointers() const { return _vpointers; } const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; } + // Compute the cost of the (scalar) body. + float cost_for_scalar_loop() const; + bool has_zero_cost(Node* n) const; + + // Cost-modeling with tracing. + float cost_for_scalar_node(int opcode) const; + float cost_for_vector_node(int opcode, int vlen, BasicType bt) const; + float cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; + private: bool setup_submodules(); VStatus setup_submodules_helper(); diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 46e8f43cb657d..9fd6ad1089c55 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -186,6 +186,99 @@ int VTransformGraph::count_alive_vtnodes() const { return count; } +// Find all nodes that are in the loop, in a 2-phase process: +// - First, find all nodes that are not before the loop: +// - loop-phis +// - loads and stores that are in the loop +// - and all their transitive uses. +// - Second, we find all nodes that are not after the loop: +// - backedges +// - loads and stores that are in the loop +// - and all their transitive defs. +// +// in_loop: vtn->_idx -> bool +void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { + assert(is_scheduled(), "must already be scheduled"); + + // Phase 1: find all nodes that are not before the loop. + VectorSet is_not_before_loop; + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + // Is vtn a loop-phi? + if (vtn->isa_LoopPhi() != nullptr || + vtn->is_load_or_store_in_loop()) { + is_not_before_loop.set(vtn->_idx); + continue; + } + // Or one of its transitive uses?
+ for (uint j = 0; j < vtn->req(); j++) { + VTransformNode* def = vtn->in_req(j); + if (def != nullptr && is_not_before_loop.test(def->_idx)) { + is_not_before_loop.set(vtn->_idx); + break; + } + } + } + + // Phase 2: find all nodes that are not after the loop. + for (int i = _schedule.length()-1; i >= 0; i--) { + VTransformNode* vtn = _schedule.at(i); + if (!is_not_before_loop.test(vtn->_idx)) { continue; } + // Is load or store? + if (vtn->is_load_or_store_in_loop()) { + in_loop.set(vtn->_idx); + continue; + } + for (uint i = 0; i < vtn->out_strong_edges(); i++) { + VTransformNode* use = vtn->out_strong_edge(i); + // Or is vtn a backedge or one of its transitive defs? + if (in_loop.test(use->_idx) || + use->isa_LoopPhi() != nullptr) { + in_loop.set(vtn->_idx); + break; + } + } + } +} + +float VTransformGraph::cost_for_vector_loop() const { + assert(is_scheduled(), "must already be scheduled"); +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr("\nVTransformGraph::cost_for_vector_loop:"); + } +#endif + + // We only want to count the cost of nodes that are in the loop. + // This is especially important for cases where we were able to move + // some nodes outside the loop during VTransform::optimize, e.g.: + // VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop + ResourceMark rm; + VectorSet in_loop; // vtn->_idx -> bool + mark_vtnodes_in_loop(in_loop); + + float sum = 0; + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + if (!in_loop.test(vtn->_idx)) { continue; } + float c = vtn->cost(_vloop_analyzer); + sum += c; +#ifndef PRODUCT + if (c != 0 && _vloop.is_trace_cost_verbose()) { + tty->print(" -> cost = %.2f for ", c); + vtn->print(); + } +#endif + } + +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" total_cost = %.2f", sum); + } +#endif + return sum; +} + #ifndef PRODUCT void VTransformGraph::trace_schedule_cycle(const GrowableArray& stack, const VectorSet& pre_visited, @@ -831,6 +924,12 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app } } +float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + // This is an identity transform, but loads and stores must be counted. + assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted"); + return vloop_analyzer.cost_for_scalar_node(_node->Opcode()); +} + VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const { apply_vtn_inputs_to_node(_node, apply_state); // The memory state has to be applied separately: the vtn does not hold it. This allows reordering. @@ -843,6 +942,16 @@ VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& app return VTransformApplyResult::make_scalar(_node); } +float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + // Since this is an identity transform, we may have nodes that also + // VLoopAnalyzer::cost does not count for the scalar loop. 
+ if (vloop_analyzer.has_zero_cost(_node)) { + return 0; + } else { + return vloop_analyzer.cost_for_scalar_node(_node->Opcode()); + } +} + VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const { apply_vtn_inputs_to_node(_node, apply_state); return VTransformApplyResult::make_scalar(_node); @@ -895,6 +1004,10 @@ VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_sta return VTransformApplyResult::make_scalar(_node); } +float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector_node(Op_Replicate, _vlen, _element_type); +} + VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const { Node* val = apply_state.transformed_node(in_req(1)); VectorNode* vn = VectorNode::scalar2vector(val, _vlen, _element_type); @@ -902,6 +1015,10 @@ VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply return VTransformApplyResult::make_vector(vn); } +float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_scalar_node(Op_ConvI2L); +} + VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const { Node* val = apply_state.transformed_node(in_req(1)); Node* n = new ConvI2LNode(val); @@ -909,6 +1026,12 @@ VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_s return VTransformApplyResult::make_scalar(n); } +float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode); + return vloop_analyzer.cost_for_scalar_node(Op_AndI) + + vloop_analyzer.cost_for_vector_node(shift_count_opc, _vlen, _element_bt); +} + VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const { PhaseIdealLoop* phase = apply_state.phase(); Node* shift_count_in = apply_state.transformed_node(in_req(1)); @@ -924,6 +1047,9 @@ VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& appl return VTransformApplyResult::make_vector(vn); } +float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector_node(Op_PopulateIndex, _vlen, _element_bt); +} VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const { PhaseIdealLoop* phase = apply_state.phase(); @@ -936,6 +1062,10 @@ VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& a return VTransformApplyResult::make_vector(vn); } +float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector_node(_vector_opcode, vector_length(), element_basic_type()); +} + VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const { assert(2 <= req() && req() <= 4, "Must have 1-3 inputs"); const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length()); @@ -954,6 +1084,12 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat return VTransformApplyResult::make_vector(vn); } +float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type()); + return vloop_analyzer.cost_for_vector_node(vopc, vector_length(), element_basic_type()) + + vloop_analyzer.cost_for_vector_node(Op_VectorCastL2X, vector_length(), 
T_INT); +} + VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const { uint vlen = vector_length(); int sopc = scalar_opcode(); @@ -969,6 +1105,10 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply( return VTransformApplyResult::make_vector(vn); } +float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector_node(Op_VectorReinterpret, vector_length(), element_basic_type()); +} + VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const { const TypeVect* dst_vt = TypeVect::make(element_basic_type(), vector_length()); const TypeVect* src_vt = TypeVect::make(_src_bt, vector_length()); @@ -981,6 +1121,11 @@ VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyStat return VTransformApplyResult::make_vector(vn); } +float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + assert(scalar_opcode() == Op_Bool, ""); + return vloop_analyzer.cost_for_vector_node(Op_VectorMaskCmp, vector_length(), element_basic_type()); +} + VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const { const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length()); assert(scalar_opcode() == Op_Bool, ""); @@ -1101,10 +1246,10 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou const BasicType bt = element_basic_type(); const int ropc = vector_reduction_opcode(); const int vopc = VectorNode::opcode(sopc, bt); - if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) { - DEBUG_ONLY( this->print(); ) - assert(false, "do not have normal vector op for this reduction"); - return false; // not implemented + if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) { + // The element-wise vector operation needed for the vector accumulator + // is not implemented / supported. 
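+ // (On aarch64 with only NEON, for example, Op_MulVL has a match rule but is rejected for auto-vectorization; see the aarch64_vector.ad change above.)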
+ return false; } // Traverse up the chain of non strict order reductions, checking that it loops @@ -1236,6 +1381,14 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou return true; // success } +float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + int vopc = vector_reduction_opcode(); + bool requires_strict_order = ReductionNode::auto_vectorization_requires_strict_order(vopc); + return vloop_analyzer.cost_for_vector_reduction_node(vopc, vlen, bt, requires_strict_order); +} + VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const { Node* init = apply_state.transformed_node(in_req(1)); Node* vec = apply_state.transformed_node(in_req(2)); @@ -1245,6 +1398,12 @@ VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& return VTransformApplyResult::make_vector(vn, vn->vect_type()); } +float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + return vloop_analyzer.cost_for_vector_node(Op_LoadVector, vlen, bt); +} + VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const { int sopc = scalar_opcode(); uint vlen = vector_length(); @@ -1274,6 +1433,12 @@ VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& appl return VTransformApplyResult::make_vector(vn, vn->vect_type()); } +float VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + return vloop_analyzer.cost_for_vector_node(Op_StoreVector, vlen, bt); +} + VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const { int sopc = scalar_opcode(); uint vlen = vector_length(); diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 7ad7b432e9b43..a30f0ff098faf 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -51,6 +51,10 @@ // - Compute linearization of the VTransformGraph, into an order that respects // all edges in the graph (bailout if cycle detected). // +// - Cost-Model: +// - We use a cost-model as a heuristic to determine if vectorization is profitable. +// Compute the cost of the loop with and without vectorization. +// // - Apply: // - Changes to the C2 IR are only made once the "apply" method is called. // - Align the main loop, by adjusting pre loop limit. 
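Illustrative only, not part of the patch: a minimal standalone C++ sketch of the unit-cost numbers this cost model assigns to reductions, mirroring VLoopAnalyzer::cost_for_vector_reduction_node from vectorization.cpp above (reduction_cost and the printed table are made up for this sketch; every reduction step is modeled as a shuffle plus a reduce, each counted as one instruction).

#include <cmath>
#include <cstdio>

// Mirrors the heuristic above: strict-order (linear) reductions cost 2 * vlen,
// non-strict-order (recursive) reductions cost 2 * log2(vlen).
static float reduction_cost(int vlen, bool requires_strict_order) {
  return requires_strict_order ? 2.0f * vlen
                               : 2.0f * std::log2(static_cast<float>(vlen));
}

int main() {
  const int vlens[] = {2, 4, 8, 16};
  for (int vlen : vlens) {
    std::printf("vlen = %2d: strict-order = %4.1f, non-strict-order = %4.1f\n",
                vlen, reduction_cost(vlen, true), reduction_cost(vlen, false));
  }
  return 0;
}

At vlen = 8 this gives 16 vs 6, while an ordinary element-wise vector op counts 1. When a non-strict-order reduction is additionally moved out of the loop by VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop, only a unit-cost element-wise accumulator remains inside the loop, which is what makes cases like the int sum in TestIntVect::test_sum profitable now.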
@@ -190,6 +194,7 @@ class VTransformGraph : public StackObj { void optimize(VTransform& vtransform); bool schedule(); bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const; + float cost_for_vector_loop() const; void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const; private: @@ -200,6 +205,7 @@ class VTransformGraph : public StackObj { void collect_nodes_without_strong_in_edges(GrowableArray& stack) const; int count_alive_vtnodes() const; + void mark_vtnodes_in_loop(VectorSet& in_loop) const; #ifndef PRODUCT void print_vtnodes() const; @@ -252,6 +258,8 @@ class VTransform : public StackObj { void optimize() { return _graph.optimize(*this); } bool schedule() { return _graph.schedule(); } + bool is_profitable() const; + float cost_for_vector_loop() const { return _graph.cost_for_vector_loop(); } bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); } void apply(); @@ -549,6 +557,8 @@ class VTransformNode : public ArenaObj { virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const = 0; + virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 0; virtual void apply_backedge(VTransformApplyState& apply_state) const {}; void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const; @@ -579,6 +589,7 @@ class VTransformMemopScalarNode : public VTransformNode { virtual bool is_load_or_store_in_loop() const override { return true; } virtual const VPointer& vpointer() const override { return _vpointer; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "MemopScalar"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -595,6 +606,7 @@ class VTransformDataScalarNode : public VTransformNode { assert(!_node->is_Mem() && !_node->is_Phi() && !_node->is_CFG(), "must be data node: %s", _node->Name()); } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "DataScalar"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -612,6 +624,7 @@ class VTransformLoopPhiNode : public VTransformNode { } virtual VTransformLoopPhiNode* isa_LoopPhi() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; virtual void apply_backedge(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoopPhi"; };) @@ -629,6 +642,7 @@ class VTransformCFGNode : public VTransformNode { assert(_node->is_CFG(), "must be CFG node: %s", _node->Name()); } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "CFG"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -655,6 +669,7 @@ class VTransformOuterNode : public VTransformNode { VTransformNode(vtransform, n->req()), _node(n) {} virtual VTransformOuterNode* 
isa_Outer() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { ShouldNotReachHere(); } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "Outer"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -668,6 +683,7 @@ class VTransformReplicateNode : public VTransformNode { public: VTransformReplicateNode(VTransform& vtransform, int vlen, BasicType element_type) : VTransformNode(vtransform, 2), _vlen(vlen), _element_type(element_type) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "Replicate"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -677,6 +693,7 @@ class VTransformReplicateNode : public VTransformNode { class VTransformConvI2LNode : public VTransformNode { public: VTransformConvI2LNode(VTransform& vtransform) : VTransformNode(vtransform, 2) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ConvI2L"; };) }; @@ -691,6 +708,7 @@ class VTransformShiftCountNode : public VTransformNode { public: VTransformShiftCountNode(VTransform& vtransform, int vlen, BasicType element_bt, juint mask, int shift_opcode) : VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt), _mask(mask), _shift_opcode(shift_opcode) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ShiftCount"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -704,6 +722,7 @@ class VTransformPopulateIndexNode : public VTransformNode { public: VTransformPopulateIndexNode(VTransform& vtransform, int vlen, const BasicType element_bt) : VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "PopulateIndex"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -769,6 +788,7 @@ class VTransformElementWiseVectorNode : public VTransformVectorNode { VTransformElementWiseVectorNode(VTransform& vtransform, uint req, const VTransformVectorNodeProperties properties, const int vector_opcode) : VTransformVectorNode(vtransform, req, properties), _vector_opcode(vector_opcode) {} virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -781,6 +801,7 @@ class VTransformElementWiseLongOpWithCastToIntVectorNode : public VTransformVect public: VTransformElementWiseLongOpWithCastToIntVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) : VTransformVectorNode(vtransform, 2, properties) {} + virtual float cost(const VLoopAnalyzer& 
vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseLongOpWithCastToIntVector"; };) }; @@ -791,6 +812,7 @@ class VTransformReinterpretVectorNode : public VTransformVectorNode { public: VTransformReinterpretVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties, const BasicType src_bt) : VTransformVectorNode(vtransform, 2, properties), _src_bt(src_bt) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ReinterpretVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -811,6 +833,7 @@ class VTransformCmpVectorNode : public VTransformVectorNode { VTransformCmpVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) : VTransformVectorNode(vtransform, 3, properties) {} virtual VTransformCmpVectorNode* isa_CmpVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override { return VTransformApplyResult::make_empty(); } NOT_PRODUCT(virtual const char* name() const override { return "CmpVector"; };) }; @@ -823,6 +846,7 @@ class VTransformBoolVectorNode : public VTransformVectorNode { VTransformVectorNode(vtransform, 2, properties), _test(test) {} VTransformBoolTest test() const { return _test; } virtual VTransformBoolVectorNode* isa_BoolVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "BoolVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -835,6 +859,7 @@ class VTransformReductionVectorNode : public VTransformVectorNode { VTransformVectorNode(vtransform, 3, properties) {} virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; } virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override; + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };) @@ -877,6 +902,7 @@ class VTransformLoadVectorNode : public VTransformMemVectorNode { LoadNode::ControlDependency control_dependency() const; virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; } virtual bool is_load_in_loop() const override { return true; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };) }; @@ -888,6 +914,7 @@ class VTransformStoreVectorNode : public VTransformMemVectorNode { VTransformMemVectorNode(vtransform, 4, properties, vpointer, adr_type) {} virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; } virtual bool is_load_in_loop() const override { return false; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) 
const override; NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };) }; diff --git a/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java b/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java index 457e33667b2d1..76c33ec1b0772 100644 --- a/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java +++ b/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java @@ -410,12 +410,12 @@ public void run() { } - // Not vectorized: simple addition not profitable, see JDK-8307516. NOTE: - // This check does not document the _desired_ behavior of the system but - // the current behavior (no vectorization) @Test - @IR(counts = { IRNode.LOAD_VECTOR_I, "= 0", - IRNode.STORE_VECTOR, "= 0" }) + @IR(counts = { IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0" }) + // The reduction is moved outside the loop, and we use an + // element-wise accumulator inside the loop. int test_sum(int[] a1) { int sum = 0; for (int i = 0; i < a1.length; i+=1) { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java index 10ad19d03a74d..89b46871cb56a 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java @@ -115,17 +115,18 @@ public static void checkSimpleFloatCopy() { @Test @Warmup(10) @IR(applyIfCPUFeatureOr = {"avx", "true"}, - applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}, counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"}) @IR(applyIfCPUFeatureOr = {"avx", "true"}, - applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}, counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"}) - // Current heuristics say that this simple int reduction is not profitable. - // But it would actually be profitable, since we are able to move the - // reduction out of the loop (we can reorder the reduction). When moving - // the reduction out of the loop, we instead accumulate with a simple - // ADD_VI inside the loop. - // See: JDK-8307516 JDK-8345044 + // We are able to vectorize the reduction. But on its own, that would + // not reduce the cost sufficiently in all cases, because vectorized + // reduction nodes are expensive. But since integer addition is associative, + // we can move the reduction vector out of the loop. Instead, we accumulate + // with a simple ADD_VI inside the loop, which is very cheap. After the + // loop, we only need to use the vectorized reduction once, to collapse + // the partial sums contained in the lanes. private static int simpleIntReduction() { int sum = 0; for (int i = 0; i < aI.length; i++) { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java new file mode 100644 index 0000000000000..1cd5cfa1e750c --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -0,0 +1,2452 @@ +/* + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test id=no-vectorization + * @bug 8340093 + * @summary Test vectorization of reduction loops. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions P0 + */ + +/* + * @test id=vanilla + * @bug 8340093 + * @summary Test vectorization of reduction loops. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions P1 + */ + +/* + * @test id=force-vectorization + * @bug 8340093 + * @summary Test vectorization of reduction loops. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions P2 + */ + +package compiler.loopopts.superword; + +import java.util.Map; +import java.util.HashMap; + +import compiler.lib.ir_framework.*; +import compiler.lib.verify.*; +import static compiler.lib.generators.Generators.G; +import compiler.lib.generators.Generator; + +/** + * Note: there is a corresponding JMH benchmark: + * test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java + */ +public class TestReductions { + private static int SIZE = 1024*8; + private static final Generator GEN_I = G.ints(); + private static final Generator GEN_L = G.longs(); + private static final Generator GEN_F = G.floats(); + private static final Generator GEN_D = G.doubles(); + + private static byte[] in1B = fillRandom(new byte[SIZE]); + private static byte[] in2B = fillRandom(new byte[SIZE]); + private static byte[] in3B = fillRandom(new byte[SIZE]); + private static char[] in1C = fillRandom(new char[SIZE]); + private static char[] in2C = fillRandom(new char[SIZE]); + private static char[] in3C = fillRandom(new char[SIZE]); + private static short[] in1S = fillRandom(new short[SIZE]); + private static short[] in2S = fillRandom(new short[SIZE]); + private static short[] in3S = fillRandom(new short[SIZE]); + + private static int[] in1I = fillRandom(new int[SIZE]); + private static int[] in2I = fillRandom(new int[SIZE]); + private static int[] in3I = fillRandom(new int[SIZE]); + private static long[] in1L = fillRandom(new long[SIZE]); + private static long[] in2L = fillRandom(new long[SIZE]); + private static long[] in3L = fillRandom(new long[SIZE]); + + private static float[] in1F = fillRandom(new float[SIZE]); + private static float[] in2F = fillRandom(new float[SIZE]); + private static float[] in3F = fillRandom(new float[SIZE]); + private static double[] in1D = fillRandom(new double[SIZE]); + private static double[] in2D = fillRandom(new double[SIZE]); + private static double[] in3D = fillRandom(new double[SIZE]); + + interface TestFunction { + Object run(); + } + + // Map of test names to tests. 
+ Map tests = new HashMap(); + + // Map of gold, the results from the first run (before compilation), one per tests entry. + Map golds = new HashMap(); + + public static void main(String[] args) { + TestFramework framework = new TestFramework(TestReductions.class); + switch (args[0]) { + case "P0" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=0"); } + case "P1" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=1"); } + // Note: increasing the node count limit also helps in some cases. + case "P2" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2", "-XX:LoopUnrollLimit=1000"); } + default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); } + }; + framework.start(); + } + + public TestReductions() { + // Add all tests to list + tests.put("byteAndSimple", TestReductions::byteAndSimple); + tests.put("byteOrSimple", TestReductions::byteOrSimple); + tests.put("byteXorSimple", TestReductions::byteXorSimple); + tests.put("byteAddSimple", TestReductions::byteAddSimple); + tests.put("byteMulSimple", TestReductions::byteMulSimple); + tests.put("byteMinSimple", TestReductions::byteMinSimple); + tests.put("byteMaxSimple", TestReductions::byteMaxSimple); + tests.put("byteAndDotProduct", TestReductions::byteAndDotProduct); + tests.put("byteOrDotProduct", TestReductions::byteOrDotProduct); + tests.put("byteXorDotProduct", TestReductions::byteXorDotProduct); + tests.put("byteAddDotProduct", TestReductions::byteAddDotProduct); + tests.put("byteMulDotProduct", TestReductions::byteMulDotProduct); + tests.put("byteMinDotProduct", TestReductions::byteMinDotProduct); + tests.put("byteMaxDotProduct", TestReductions::byteMaxDotProduct); + tests.put("byteAndBig", TestReductions::byteAndBig); + tests.put("byteOrBig", TestReductions::byteOrBig); + tests.put("byteXorBig", TestReductions::byteXorBig); + tests.put("byteAddBig", TestReductions::byteAddBig); + tests.put("byteMulBig", TestReductions::byteMulBig); + tests.put("byteMinBig", TestReductions::byteMinBig); + tests.put("byteMaxBig", TestReductions::byteMaxBig); + + tests.put("charAndSimple", TestReductions::charAndSimple); + tests.put("charOrSimple", TestReductions::charOrSimple); + tests.put("charXorSimple", TestReductions::charXorSimple); + tests.put("charAddSimple", TestReductions::charAddSimple); + tests.put("charMulSimple", TestReductions::charMulSimple); + tests.put("charMinSimple", TestReductions::charMinSimple); + tests.put("charMaxSimple", TestReductions::charMaxSimple); + tests.put("charAndDotProduct", TestReductions::charAndDotProduct); + tests.put("charOrDotProduct", TestReductions::charOrDotProduct); + tests.put("charXorDotProduct", TestReductions::charXorDotProduct); + tests.put("charAddDotProduct", TestReductions::charAddDotProduct); + tests.put("charMulDotProduct", TestReductions::charMulDotProduct); + tests.put("charMinDotProduct", TestReductions::charMinDotProduct); + tests.put("charMaxDotProduct", TestReductions::charMaxDotProduct); + tests.put("charAndBig", TestReductions::charAndBig); + tests.put("charOrBig", TestReductions::charOrBig); + tests.put("charXorBig", TestReductions::charXorBig); + tests.put("charAddBig", TestReductions::charAddBig); + tests.put("charMulBig", TestReductions::charMulBig); + tests.put("charMinBig", TestReductions::charMinBig); + tests.put("charMaxBig", TestReductions::charMaxBig); + + tests.put("shortAndSimple", 
TestReductions::shortAndSimple); + tests.put("shortOrSimple", TestReductions::shortOrSimple); + tests.put("shortXorSimple", TestReductions::shortXorSimple); + tests.put("shortAddSimple", TestReductions::shortAddSimple); + tests.put("shortMulSimple", TestReductions::shortMulSimple); + tests.put("shortMinSimple", TestReductions::shortMinSimple); + tests.put("shortMaxSimple", TestReductions::shortMaxSimple); + tests.put("shortAndDotProduct", TestReductions::shortAndDotProduct); + tests.put("shortOrDotProduct", TestReductions::shortOrDotProduct); + tests.put("shortXorDotProduct", TestReductions::shortXorDotProduct); + tests.put("shortAddDotProduct", TestReductions::shortAddDotProduct); + tests.put("shortMulDotProduct", TestReductions::shortMulDotProduct); + tests.put("shortMinDotProduct", TestReductions::shortMinDotProduct); + tests.put("shortMaxDotProduct", TestReductions::shortMaxDotProduct); + tests.put("shortAndBig", TestReductions::shortAndBig); + tests.put("shortOrBig", TestReductions::shortOrBig); + tests.put("shortXorBig", TestReductions::shortXorBig); + tests.put("shortAddBig", TestReductions::shortAddBig); + tests.put("shortMulBig", TestReductions::shortMulBig); + tests.put("shortMinBig", TestReductions::shortMinBig); + tests.put("shortMaxBig", TestReductions::shortMaxBig); + + tests.put("intAndSimple", TestReductions::intAndSimple); + tests.put("intOrSimple", TestReductions::intOrSimple); + tests.put("intXorSimple", TestReductions::intXorSimple); + tests.put("intAddSimple", TestReductions::intAddSimple); + tests.put("intMulSimple", TestReductions::intMulSimple); + tests.put("intMinSimple", TestReductions::intMinSimple); + tests.put("intMaxSimple", TestReductions::intMaxSimple); + tests.put("intAndDotProduct", TestReductions::intAndDotProduct); + tests.put("intOrDotProduct", TestReductions::intOrDotProduct); + tests.put("intXorDotProduct", TestReductions::intXorDotProduct); + tests.put("intAddDotProduct", TestReductions::intAddDotProduct); + tests.put("intMulDotProduct", TestReductions::intMulDotProduct); + tests.put("intMinDotProduct", TestReductions::intMinDotProduct); + tests.put("intMaxDotProduct", TestReductions::intMaxDotProduct); + tests.put("intAndBig", TestReductions::intAndBig); + tests.put("intOrBig", TestReductions::intOrBig); + tests.put("intXorBig", TestReductions::intXorBig); + tests.put("intAddBig", TestReductions::intAddBig); + tests.put("intMulBig", TestReductions::intMulBig); + tests.put("intMinBig", TestReductions::intMinBig); + tests.put("intMaxBig", TestReductions::intMaxBig); + + tests.put("longAndSimple", TestReductions::longAndSimple); + tests.put("longOrSimple", TestReductions::longOrSimple); + tests.put("longXorSimple", TestReductions::longXorSimple); + tests.put("longAddSimple", TestReductions::longAddSimple); + tests.put("longMulSimple", TestReductions::longMulSimple); + tests.put("longMinSimple", TestReductions::longMinSimple); + tests.put("longMaxSimple", TestReductions::longMaxSimple); + tests.put("longAndDotProduct", TestReductions::longAndDotProduct); + tests.put("longOrDotProduct", TestReductions::longOrDotProduct); + tests.put("longXorDotProduct", TestReductions::longXorDotProduct); + tests.put("longAddDotProduct", TestReductions::longAddDotProduct); + tests.put("longMulDotProduct", TestReductions::longMulDotProduct); + tests.put("longMinDotProduct", TestReductions::longMinDotProduct); + tests.put("longMaxDotProduct", TestReductions::longMaxDotProduct); + tests.put("longAndBig", TestReductions::longAndBig); + tests.put("longOrBig", 
TestReductions::longOrBig); + tests.put("longXorBig", TestReductions::longXorBig); + tests.put("longAddBig", TestReductions::longAddBig); + tests.put("longMulBig", TestReductions::longMulBig); + tests.put("longMinBig", TestReductions::longMinBig); + tests.put("longMaxBig", TestReductions::longMaxBig); + + tests.put("floatAddSimple", TestReductions::floatAddSimple); + tests.put("floatMulSimple", TestReductions::floatMulSimple); + tests.put("floatMinSimple", TestReductions::floatMinSimple); + tests.put("floatMaxSimple", TestReductions::floatMaxSimple); + tests.put("floatAddDotProduct", TestReductions::floatAddDotProduct); + tests.put("floatMulDotProduct", TestReductions::floatMulDotProduct); + tests.put("floatMinDotProduct", TestReductions::floatMinDotProduct); + tests.put("floatMaxDotProduct", TestReductions::floatMaxDotProduct); + tests.put("floatAddBig", TestReductions::floatAddBig); + tests.put("floatMulBig", TestReductions::floatMulBig); + tests.put("floatMinBig", TestReductions::floatMinBig); + tests.put("floatMaxBig", TestReductions::floatMaxBig); + + tests.put("doubleAddSimple", TestReductions::doubleAddSimple); + tests.put("doubleMulSimple", TestReductions::doubleMulSimple); + tests.put("doubleMinSimple", TestReductions::doubleMinSimple); + tests.put("doubleMaxSimple", TestReductions::doubleMaxSimple); + tests.put("doubleAddDotProduct", TestReductions::doubleAddDotProduct); + tests.put("doubleMulDotProduct", TestReductions::doubleMulDotProduct); + tests.put("doubleMinDotProduct", TestReductions::doubleMinDotProduct); + tests.put("doubleMaxDotProduct", TestReductions::doubleMaxDotProduct); + tests.put("doubleAddBig", TestReductions::doubleAddBig); + tests.put("doubleMulBig", TestReductions::doubleMulBig); + tests.put("doubleMinBig", TestReductions::doubleMinBig); + tests.put("doubleMaxBig", TestReductions::doubleMaxBig); + + // Compute gold value for all test methods before compilation + for (Map.Entry entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + Object gold = test.run(); + golds.put(name, gold); + } + } + + @Warmup(100) + @Run(test = {"byteAndSimple", + "byteOrSimple", + "byteXorSimple", + "byteAddSimple", + "byteMulSimple", + "byteMinSimple", + "byteMaxSimple", + "byteAndDotProduct", + "byteOrDotProduct", + "byteXorDotProduct", + "byteAddDotProduct", + "byteMulDotProduct", + "byteMinDotProduct", + "byteMaxDotProduct", + "byteAndBig", + "byteOrBig", + "byteXorBig", + "byteAddBig", + "byteMulBig", + "byteMinBig", + "byteMaxBig", + + "charAndSimple", + "charOrSimple", + "charXorSimple", + "charAddSimple", + "charMulSimple", + "charMinSimple", + "charMaxSimple", + "charAndDotProduct", + "charOrDotProduct", + "charXorDotProduct", + "charAddDotProduct", + "charMulDotProduct", + "charMinDotProduct", + "charMaxDotProduct", + "charAndBig", + "charOrBig", + "charXorBig", + "charAddBig", + "charMulBig", + "charMinBig", + "charMaxBig", + + "shortAndSimple", + "shortOrSimple", + "shortXorSimple", + "shortAddSimple", + "shortMulSimple", + "shortMinSimple", + "shortMaxSimple", + "shortAndDotProduct", + "shortOrDotProduct", + "shortXorDotProduct", + "shortAddDotProduct", + "shortMulDotProduct", + "shortMinDotProduct", + "shortMaxDotProduct", + "shortAndBig", + "shortOrBig", + "shortXorBig", + "shortAddBig", + "shortMulBig", + "shortMinBig", + "shortMaxBig", + + "intAndSimple", + "intOrSimple", + "intXorSimple", + "intAddSimple", + "intMulSimple", + "intMinSimple", + "intMaxSimple", + "intAndDotProduct", + "intOrDotProduct", + 
"intXorDotProduct", + "intAddDotProduct", + "intMulDotProduct", + "intMinDotProduct", + "intMaxDotProduct", + "intAndBig", + "intOrBig", + "intXorBig", + "intAddBig", + "intMulBig", + "intMinBig", + "intMaxBig", + + "longAndSimple", + "longOrSimple", + "longXorSimple", + "longAddSimple", + "longMulSimple", + "longMinSimple", + "longMaxSimple", + "longAndDotProduct", + "longOrDotProduct", + "longXorDotProduct", + "longAddDotProduct", + "longMulDotProduct", + "longMinDotProduct", + "longMaxDotProduct", + "longAndBig", + "longOrBig", + "longXorBig", + "longAddBig", + "longMulBig", + "longMinBig", + "longMaxBig", + + "floatAddSimple", + "floatMulSimple", + "floatMinSimple", + "floatMaxSimple", + "floatAddDotProduct", + "floatMulDotProduct", + "floatMinDotProduct", + "floatMaxDotProduct", + "floatAddBig", + "floatMulBig", + "floatMinBig", + "floatMaxBig", + + "doubleAddSimple", + "doubleMulSimple", + "doubleMinSimple", + "doubleMaxSimple", + "doubleAddDotProduct", + "doubleMulDotProduct", + "doubleMinDotProduct", + "doubleMaxDotProduct", + "doubleAddBig", + "doubleMulBig", + "doubleMinBig", + "doubleMaxBig"}) + public void runTests() { + for (Map.Entry entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + // Recall gold value from before compilation + Object gold = golds.get(name); + // Compute new result + Object result = test.run(); + // Compare gold and new result + try { + Verify.checkEQ(gold, result); + } catch (VerifyException e) { + throw new RuntimeException("Verify failed for " + name, e); + } + } + } + + static byte[] fillRandom(byte[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (byte)(int)GEN_I.next(); + } + return a; + } + + static char[] fillRandom(char[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (char)(int)GEN_I.next(); + } + return a; + } + + static short[] fillRandom(short[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (short)(int)GEN_I.next(); + } + return a; + } + + static int[] fillRandom(int[] a) { + G.fill(GEN_I, a); + return a; + } + + static long[] fillRandom(long[] a) { + G.fill(GEN_L, a); + return a; + } + + static float[] fillRandom(float[] a) { + G.fill(GEN_F, a); + return a; + } + + static double[] fillRandom(double[] a) { + G.fill(GEN_D, a); + return a; + } + + // ---------byte***Simple ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteAndSimple() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteOrSimple() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteXorSimple() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. 
+ private static byte byteAddSimple() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMulSimple() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMinSimple() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMaxSimple() { + byte acc = Byte.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------byte***DotProduct ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteAndDotProduct() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteOrDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteXorDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteAddDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMulDotProduct() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMinDotProduct() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMaxDotProduct() { + byte acc = Byte.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------byte***Big ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. 
+ private static byte byteAndBig() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteOrBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteXorBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteAddBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMulBig() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMinBig() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMaxBig() { + byte acc = Byte.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------char***Simple ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAndSimple() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charOrSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charXorSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAddSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. 
+ private static char charMulSimple() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMinSimple() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMaxSimple() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------char***DotProduct ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAndDotProduct() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charOrDotProduct() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charXorDotProduct() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAddDotProduct() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMulDotProduct() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMinDotProduct() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMaxDotProduct() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------char***Big ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAndBig() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. 
+ private static char charOrBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charXorBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAddBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMulBig() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMinBig() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMaxBig() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------short***Simple ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAndSimple() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortOrSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortXorSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAddSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMulSimple() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
+ private static short shortMinSimple() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMaxSimple() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------short***DotProduct ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAndDotProduct() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortOrDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortXorDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAddDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMulDotProduct() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMinDotProduct() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMaxDotProduct() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------short***Big ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAndBig() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
+ private static short shortOrBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortXorBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAddBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMulBig() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMinBig() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMaxBig() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------int***Simple ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAndSimple() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intOrSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intXorSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc ^= val; + } + 
return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAddSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMulSimple() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMinSimple() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMaxSimple() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------int***DotProduct ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAndDotProduct() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intOrDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + 
private static int intXorDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAddDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMulDotProduct() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMinDotProduct() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMaxDotProduct() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------int***Big ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAndBig() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intOrBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + 
IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intXorBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc ^= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAddBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMulBig() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMinBig() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMaxBig() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc = Math.max(acc, val); + } + return acc; + } + + // ---------long***Simple ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAndSimple() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", 
"true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, // vector accumulator + applyIfCPUFeature = {"avx512dq", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370673 + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "= 0"}, // Reduction NOT moved out of loop + applyIfCPUFeatureOr = {"asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + // Note: NEON does not support MulVL for auto vectorization. There is + // a scalarized implementation, but that is not profitable for + // auto vectorization in almost all cases, and would not be + // profitable here at any rate. + // Hence, we have to keep the reduction inside the loop, and + // cannot use the MulVL as the vector accumulator. + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMulSimple() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinSimple() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMaxSimple() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------long***DotProduct ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAndDotProduct() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. 
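The longMulSimple rules above distinguish a MulVL used as a vector accumulator (with the reduction moved after the loop) from a MulReductionVL that has to stay inside the loop. A minimal stand-alone sketch of the accumulator shape, emulating a hypothetical 4-lane long vector with a plain array (illustration only, not part of the test):

```java
// Illustrative sketch only: the "vector accumulator" shape, with a 4-element
// array standing in for a hypothetical 4-lane long vector.
public class MulAccumulatorSketch {
    static final int LANES = 4; // assumed lane count, for illustration only

    static long mulReduce(long[] in) {
        long[] acc = {1, 1, 1, 1};                  // lane-wise accumulator: the "MulVL" per iteration
        int i = 0;
        for (; i + LANES <= in.length; i += LANES) {
            for (int l = 0; l < LANES; l++) {
                acc[l] *= in[i + l];                // element-wise multiply, no cross-lane dependency
            }
        }
        long r = acc[0] * acc[1] * acc[2] * acc[3]; // single reduction after the loop: the "MulReductionVL"
        for (; i < in.length; i++) {
            r *= in[i];                             // scalar tail
        }
        return r;
    }

    public static void main(String[] args) {
        long[] in = {1, 2, 3, 4, 5, 6, 7};
        System.out.println(mulReduce(in));          // 5040, same as a plain scalar product
    }
}
```

Because NEON has no single-instruction MulVL, the lane-wise multiply in the loop body has no cheap equivalent there, which is why the asimd rule above expects MUL_VL to stay at 0 and the reduction to remain inside the loop.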
+ // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While AddReductionVL is implemented in NEON (see longAddSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, + applyIfCPUFeature = {"avx512dq", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370673 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // MulVL is not implemented on NEON, so we also do not have the reduction. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMulDotProduct() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeature = {"avx512", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MinReductionV is implemented in NEON (see longMinSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinDotProduct() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeature = {"avx512", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not.
+ // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMaxDotProduct() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------long***Big ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAndBig() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc ^= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longAddSimple), MulVL is not. 
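All of the long DotProduct and Big comments above come down to the same missing node: even where the reduction itself (AndReductionV, OrReductionV, and so on) is available on NEON, the per-iteration in1L[i] * in2L[i] would have to become a MulVL, so the whole chain stays scalar. A sketch of how the AND dot-product splits into the pieces the IR rules count, emulating a hypothetical 2-lane long vector with plain code (illustration only, not part of the test):

```java
// Illustrative sketch only: the AND dot-product reduction split into the two
// vector pieces the IR rules count, with a 2-element array standing in for a
// hypothetical 2-lane long vector.
public class AndDotProductSketch {
    static long andDotProduct(long[] a, long[] b) {
        long[] acc = {-1L, -1L};                    // all bits set: lane-wise neutral element for AND
        int i = 0;
        for (; i + 2 <= a.length; i += 2) {
            long p0 = a[i] * b[i];                  // element-wise multiply: the MulVL part
            long p1 = a[i + 1] * b[i + 1];
            acc[0] &= p0;                           // lane-wise AND: the AndV part
            acc[1] &= p1;
        }
        long r = acc[0] & acc[1];                   // AndReductionV, once after the loop
        for (; i < a.length; i++) {
            r &= a[i] * b[i];                       // scalar tail
        }
        return r;
    }

    public static void main(String[] args) {
        long[] a = {3, 5, 7, 9};
        long[] b = {2, 4, 6, 8};
        System.out.println(andDotProduct(a, b));    // same value as the scalar AND of all products
    }
}
```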
+ // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, + applyIfCPUFeature = {"avx512dq", "true"}, + applyIfAnd = {"AutoVectorizationOverrideProfitability", "> 0", + "LoopUnrollLimit", ">= 1000"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeature = {"avx512dq", "true"}, + applyIfAnd = {"AutoVectorizationOverrideProfitability", "> 0", + "LoopUnrollLimit", "< 1000"}) + // Increasing the body limit seems to help. Filed for investigation: JDK-8370685 + // If you can eliminate this exception for LoopUnrollLimit, please remove + // the flag completely from the test, also the "addFlags" at the top. + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // MulVL is not implemented on NEON, so we also not have the reduction. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMulBig() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeature = {"avx512", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMinSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinBig() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeature = {"avx512", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not. 
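The longMulBig rules above are conditioned on LoopUnrollLimit and refer to an addFlags call at the top of the test, which is not shown in this hunk. As a rough sketch of how such a flag is usually supplied to the IR framework (the class name, flag values, and scenario setup here are assumptions for illustration, not the test's actual configuration):

```java
// Illustrative sketch only: passing LoopUnrollLimit to the IR framework.
// The class and values are hypothetical; see the actual test's setup at the top
// of the file for what it really does.
import compiler.lib.ir_framework.Scenario;
import compiler.lib.ir_framework.TestFramework;

public class FlagSetupSketch {
    public static void main(String[] args) {
        // Single flag for all runs, as referenced by the "addFlags" comment above:
        new TestFramework(FlagSetupSketch.class).addFlags("-XX:LoopUnrollLimit=1000").start();

        // Alternatively, both sides of the "LoopUnrollLimit >= 1000" condition
        // could be exercised with scenarios:
        Scenario lowUnroll  = new Scenario(0, "-XX:LoopUnrollLimit=60");
        Scenario highUnroll = new Scenario(1, "-XX:LoopUnrollLimit=1000");
        new TestFramework(FlagSetupSketch.class).addScenarios(lowUnroll, highUnroll).start();
    }
}
```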
+ // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMaxBig() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***Simple ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can vectorize. + // Scalar: n loads + n adds + // Vector: n loads + n adds + n extract (sequential order of reduction) + private static float floatAddSimple() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "= 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can vectorize. 
+ // Scalar: n loads + n mul + // Vector: n loads + n mul + n extract (sequential order of reduction) + private static float floatMulSimple() { + float acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VF, "> 0"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatMinSimple() { + float acc = Float.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VF, "> 0"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatMaxSimple() { + float acc = Float.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***DotProduct ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatAddDotProduct() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. 
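The "Scalar: n loads + n ops / Vector: n loads + n ops + n extract" accounting in the comments above follows from the strict, source-order evaluation required for float (and double) add/mul reductions. A sketch over a hypothetical 4-lane vector, written with plain scalars (illustration only, not part of the test), shows that the dependent chain of adds keeps length n even when the loads are grouped:

```java
// Illustrative sketch only: a strict-order (source-order) float add reduction.
// Even if the four loads were a single vector load, each lane still has to be
// extracted and added into acc in order, because float addition is not
// associative; the dependency chain through acc stays length n.
public class StrictOrderAddSketch {
    static float addStrictOrder(float[] in) {
        float acc = 0.0f;
        int i = 0;
        for (; i + 4 <= in.length; i += 4) {
            float l0 = in[i];       // one vector load would fetch these four lanes at once...
            float l1 = in[i + 1];
            float l2 = in[i + 2];
            float l3 = in[i + 3];
            acc += l0;              // ...but the adds remain sequential, lane by lane
            acc += l1;
            acc += l2;
            acc += l3;
        }
        for (; i < in.length; i++) {
            acc += in[i];           // scalar tail
        }
        return acc;
    }

    public static void main(String[] args) {
        float[] in = {1e8f, 1.0f, -1e8f, 1.0f, 0.5f};
        System.out.println(addStrictOrder(in)); // identical result to a plain scalar loop
    }
}
```

The float and double Min/Max variants above do not have this constraint, since min and max can be re-associated, which is why their rules expect MIN_VF/MAX_VF vectors at the default profitability setting.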
+ @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatMulDotProduct() { + float acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VF, "> 0"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatMinDotProduct() { + float acc = Float.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VF, "> 0"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatMaxDotProduct() { + float acc = Float.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***Big ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatAddBig() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]); + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. 
+    @IR(failOn = IRNode.LOAD_VECTOR_F,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static float floatMulBig() {
+        float acc = 1; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]);
+            acc *= val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0",
+                  IRNode.MIN_REDUCTION_V, "> 0",
+                  IRNode.MIN_VF, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_F,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static float floatMinBig() {
+        float acc = Float.MAX_VALUE; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]);
+            acc = Math.min(acc, val);
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0",
+                  IRNode.MAX_REDUCTION_V, "> 0",
+                  IRNode.MAX_VF, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_F,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static float floatMaxBig() {
+        float acc = Float.MIN_VALUE; // initial value; note: MIN_VALUE is the smallest positive value, not the identity for max
+        for (int i = 0; i < SIZE; i++) {
+            float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]);
+            acc = Math.max(acc, val);
+        }
+        return acc;
+    }
+
+    // ---------double***Simple ------------------------------------------------------------
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.ADD_REDUCTION_VD, "> 0",
+                  IRNode.ADD_VD, "= 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 2"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "< 2"})
+    // Not considered profitable by the cost model, but if forced we can vectorize.
+    // Scalar: n loads + n adds
+    // Vector: n loads + n adds + n extract (sequential order of reduction)
+    private static double doubleAddSimple() {
+        double acc = 0; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i];
+            acc += val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_REDUCTION_VD, "> 0",
+                  IRNode.MUL_VD, "= 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 2"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "< 2"})
+    // Not considered profitable by the cost model, but if forced we can vectorize.
+    // Scalar: n loads + n mul
+    // Vector: n loads + n mul + n extract (sequential order of reduction)
+    private static double doubleMulSimple() {
+        double acc = 1; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i];
+            acc *= val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MIN_REDUCTION_V, "> 0",
+                  IRNode.MIN_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMinSimple() {
+        double acc = Double.MAX_VALUE; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i];
+            acc = Math.min(acc, val);
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MAX_REDUCTION_V, "> 0",
+                  IRNode.MAX_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMaxSimple() {
+        double acc = Double.MIN_VALUE; // initial value; note: MIN_VALUE is the smallest positive value, not the identity for max
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i];
+            acc = Math.max(acc, val);
+        }
+        return acc;
+    }
+
+    // ---------double***DotProduct ------------------------------------------------------------
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.ADD_REDUCTION_V, "> 0",
+                  IRNode.ADD_VD, "= 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleAddDotProduct() {
+        double acc = 0; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i] * in2D[i];
+            acc += val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_REDUCTION_VD, "> 0",
+                  IRNode.MUL_VD, "> 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMulDotProduct() {
+        double acc = 1; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i] * in2D[i];
+            acc *= val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MIN_REDUCTION_V, "> 0",
+                  IRNode.MIN_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMinDotProduct() {
+        double acc = Double.MAX_VALUE; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i] * in2D[i];
+            acc = Math.min(acc, val);
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MAX_REDUCTION_V, "> 0",
+                  IRNode.MAX_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMaxDotProduct() {
+        double acc = Double.MIN_VALUE; // initial value; note: MIN_VALUE is the smallest positive value, not the identity for max
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i] * in2D[i];
+            acc = Math.max(acc, val);
+        }
+        return acc;
+    }
+
+    // ---------double***Big ------------------------------------------------------------
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.ADD_REDUCTION_V, "> 0",
+                  IRNode.ADD_VD, "> 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleAddBig() {
+        double acc = 0; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]);
+            acc += val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_REDUCTION_VD, "> 0",
+                  IRNode.MUL_VD, "> 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMulBig() {
+        double acc = 1; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]);
+            acc *= val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MIN_REDUCTION_V, "> 0",
+                  IRNode.MIN_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMinBig() {
+        double acc = Double.MAX_VALUE; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]);
+            acc = Math.min(acc, val);
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MAX_REDUCTION_V, "> 0",
+                  IRNode.MAX_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMaxBig() {
+        double acc = Double.MIN_VALUE; // initial value; note: MIN_VALUE is the smallest positive value, not the identity for max
+        for (int i = 0; i < SIZE; i++) {
+            double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]);
+            acc = Math.max(acc, val);
+        }
+        return acc;
+    }
+
+
+}
diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java
index ec614cb324bc2..63fbf03008301 100644
--- a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java
+++ b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java
@@ -28,6 +28,10 @@
 import java.util.concurrent.TimeUnit;
 import java.util.Random;
 
+/**
+ * Note: there is a corresponding IR test:
+ * test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java
+ */
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.NANOSECONDS)
 @State(Scope.Thread)
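Note on the "(sequential order of reduction)" remarks in the test comments above: float and double add and mul are not associative, so the auto-vectorizer must preserve the scalar loop's left-to-right evaluation order for these reductions, which is why the comments count an extract per element for the vector form. The following is a minimal standalone sketch, not part of the patch (the class name is made up for illustration), showing how reassociating a float sum across lanes can change the result:

// Hypothetical illustration only; not part of TestReductions.java or the patch.
public class SequentialReductionOrderDemo {
    public static void main(String[] args) {
        float[] in = {1e8f, 1.0f, -1e8f, 1.0f};

        // Strict sequential (JLS) order, as in the scalar loops of the test:
        // the 1.0f added right after 1e8f is lost to rounding (adjacent floats
        // near 1e8f are 8.0f apart), so the final result is 1.0f.
        float sequential = 0.0f;
        for (float v : in) {
            sequential += v;
        }

        // Reassociated across two "lanes", the way an unordered vector
        // reduction would combine elements: both 1.0f contributions survive.
        float lane0 = in[0] + in[2]; // 1e8f + (-1e8f) = 0.0f
        float lane1 = in[1] + in[3]; // 1.0f + 1.0f    = 2.0f
        float reassociated = lane0 + lane1;

        System.out.println("sequential   = " + sequential);   // prints 1.0
        System.out.println("reassociated = " + reassociated); // prints 2.0
    }
}

Min and max reductions are insensitive to this reassociation, which is consistent with the IR rules above: they are expected to vectorize whenever AutoVectorizationOverrideProfitability is greater than 0, while some of the strict-order add/mul reductions (for example the double Simple variants) are only expected to vectorize when the cost model is overridden with AutoVectorizationOverrideProfitability=2.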