From 3d7bca8358c292c787bc31ed91ebb7b5e14b2cff Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Sat, 10 Feb 2024 18:52:23 +0100
Subject: [PATCH 01/13] 8325589

---
 src/hotspot/share/opto/loopopts.cpp           |  5 ++
 src/hotspot/share/opto/superword.cpp          | 10 ----
 .../share/opto/traceAutoVectorizationTag.hpp  |  1 +
 src/hotspot/share/opto/vectorization.cpp      | 34 +++++++++++++
 src/hotspot/share/opto/vectorization.hpp      | 50 +++++++++++++++++++
 5 files changed, 90 insertions(+), 10 deletions(-)

diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp
index c5d8ed39d9d0c..db8e44f08e92d 100644
--- a/src/hotspot/share/opto/loopopts.cpp
+++ b/src/hotspot/share/opto/loopopts.cpp
@@ -4232,6 +4232,11 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
   // Ensure the shared data is cleared before each use
   vshared.clear();
 
+  const VLoopAnalyzer vloop_analyzer(vloop, vshared);
+  if (!vloop_analyzer.success()) {
+    return AutoVectorizeStatus::TriedAndFailed;
+  }
+
   SuperWord sw(vloop, vshared);
   if (!sw.transform_loop()) {
     return AutoVectorizeStatus::TriedAndFailed;
diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index aa1edd01ab19e..10589d31945c2 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -399,16 +399,6 @@ bool SuperWord::transform_loop() {
   }
 #endif
 
-  // Skip any loop that has not been assigned max unroll by analysis
-  if (SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0) {
-#ifndef PRODUCT
-    if (is_trace_superword_any()) {
-      tty->print_cr("\nSuperWord::transform_loop failed: slp max unroll analysis was not already done");
-    }
-#endif
-    return false;
-  }
-
   if (!SLP_extract()) {
 #ifndef PRODUCT
     if (is_trace_superword_any()) {
diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
index 79157aca309d6..78f1301010aae 100644
--- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
+++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
@@ -31,6 +31,7 @@
 #define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \
   flags(POINTER_ANALYSIS,     "Trace VPointer") \
   flags(PRECONDITIONS,        "Trace VLoop::check_preconditions") \
+  flags(LOOP_ANALYZER,        "Trace VLoopAnalyzer::setup_submodules") \
   flags(SW_TYPES,             "Trace SuperWord::compute_vector_element_type") \
   flags(SW_ALIGNMENT,         "Trace SuperWord alignment analysis") \
   flags(SW_MEMORY_SLICES,     "Trace SuperWord memory slices") \
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index 98b996339fa46..a867561c1aa45 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -114,6 +114,40 @@ const char* VLoop::check_preconditions_helper() {
   return VLoop::SUCCESS;
 }
 
+// Return true iff all submodules are loaded successfully
+bool VLoopAnalyzer::setup_submodules() {
+#ifndef PRODUCT
+  if (vloop().is_trace_loop_analyzer()) {
+    tty->print_cr("\nVLoopAnalyzer::setup_submodules");
+    vloop().lpt()->dump_head();
+    vloop().cl()->dump();
+  }
+#endif
+
+  const char* state = setup_submodules_helper();
+  if (state == VLoopAnalyzer::SUCCESS) {
+    return true; // success
+  }
+
+#ifndef PRODUCT
+  if (vloop().is_trace_loop_analyzer()) {
+    tty->print_cr("\nVLoopAnalyzer::setup_submodules: failed: %s", state);
+  }
+#endif
+  return false; // failed
+}
+
+// Return the SUCCESS string iff all submodules are set up successfully, otherwise a string describing the failure
+const char* VLoopAnalyzer::setup_submodules_helper() {
+  // Skip any loop that has not been assigned max unroll by analysis.
+  if (SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0) {
+    return VLoopAnalyzer::FAILURE_NO_MAX_UNROLL;
+  }
+
+  // TODO
+  return VLoopAnalyzer::SUCCESS;
+}
+
 #ifndef PRODUCT
 int VPointer::Tracer::_depth = 0;
 #endif
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index 7aff58db4bb30..6106cb8547ccf 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -111,6 +111,10 @@ class VLoop : public StackObj {
     return vtrace().is_trace(TraceAutoVectorizationTag::PRECONDITIONS);
   }
 
+  bool is_trace_loop_analyzer() const {
+    return vtrace().is_trace(TraceAutoVectorizationTag::LOOP_ANALYZER);
+  }
+
   bool is_trace_pointer_analysis() const {
     return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
   }
@@ -166,6 +170,52 @@ class VSharedData : public StackObj {
   }
 };
 
+// TODO submodules
+
+// Analyze the loop in preparation for auto-vectorization. This class is
+// deliberately structured into many submodules, which are as independent
+// as possible, though some submodules do require other submodules.
+class VLoopAnalyzer : public StackObj {
+private:
+  // TODO check if all are really needed
+  static constexpr char const* SUCCESS                       = "success";
+  static constexpr char const* FAILURE_NO_MAX_UNROLL         = "slp max unroll analysis required";
+  static constexpr char const* FAILURE_NO_REDUCTION_OR_STORE = "no reduction and no store in loop";
+
+  const VLoop&               _vloop;
+
+  // Arena for all submodules
+  Arena                      _arena;
+
+  // If all submodules are setup successfully, we set this flag at the
+  // end of the constructor
+  bool                       _success;
+
+  // Submodules
+  // TODO
+
+public:
+  VLoopAnalyzer(const VLoop& vloop, VSharedData &vshared) :
+    _vloop(vloop),
+    _arena(mtCompiler),
+    _success(false)
+    // TODO modules
+  {
+    _success = setup_submodules();
+  }
+  NONCOPYABLE(VLoopAnalyzer);
+
+  bool success() const { return _success; }
+
+  // Read-only accessors for submodules
+  const VLoop& vloop()                           const { return _vloop; }
+  // TODO
+
+private:
+  bool setup_submodules();
+  const char* setup_submodules_helper();
+};
+
 // A vectorization pointer (VPointer) has information about an address for
 // dependence checking and vector alignment. It's usually bound to a memory
 // operation in a counted loop for vectorizable analysis.

From 4fcbad64628b70ad7935a88d53802758c59d397d Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Sat, 10 Feb 2024 19:52:43 +0100
Subject: [PATCH 02/13] VLoopReductions

---
 src/hotspot/cpu/x86/x86_64.ad            | 16 ++--
 src/hotspot/share/opto/loopopts.cpp      |  2 +-
 src/hotspot/share/opto/superword.cpp     | 84 ++++++++-------------
 src/hotspot/share/opto/superword.hpp     | 93 ++++++------------------
 src/hotspot/share/opto/vectorization.cpp | 14 ++++
 src/hotspot/share/opto/vectorization.hpp | 84 ++++++++++++++++++++-
 6 files changed, 158 insertions(+), 135 deletions(-)

diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad
index a248daaa1917b..eb063c9563a2a 100644
--- a/src/hotspot/cpu/x86/x86_64.ad
+++ b/src/hotspot/cpu/x86/x86_64.ad
@@ -4480,7 +4480,7 @@ instruct loadD(regD dst, memory mem)
 
 // max = java.lang.Math.max(float a, float b)
 instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
-  predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
+  predicate(UseAVX > 0 && !VLoopReductions::is_reduction(n));
   match(Set dst (MaxF a b));
   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
   format %{ "maxF $dst, $a, $b \t! using tmp, atmp and btmp as TEMP" %}
@@ -4491,7 +4491,7 @@ instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp,
 %}
 
 instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{
-  predicate(UseAVX > 0 && SuperWord::is_reduction(n));
+  predicate(UseAVX > 0 && VLoopReductions::is_reduction(n));
   match(Set dst (MaxF a b));
   effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
 
@@ -4505,7 +4505,7 @@ instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe
 
 // max = java.lang.Math.max(double a, double b)
 instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{
-  predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
+  predicate(UseAVX > 0 && !VLoopReductions::is_reduction(n));
   match(Set dst (MaxD a b));
   effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp);
   format %{ "maxD $dst, $a, $b \t! using tmp, atmp and btmp as TEMP" %}
@@ -4516,7 +4516,7 @@ instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp,
 %}
 
 instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{
-  predicate(UseAVX > 0 && SuperWord::is_reduction(n));
+  predicate(UseAVX > 0 && VLoopReductions::is_reduction(n));
   match(Set dst (MaxD a b));
   effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
 
@@ -4530,7 +4530,7 @@ instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRe
 
 // min = java.lang.Math.min(float a, float b)
 instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{
-  predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
+  predicate(UseAVX > 0 && !VLoopReductions::is_reduction(n));
   match(Set dst (MinF a b));
   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
   format %{ "minF $dst, $a, $b \t! using tmp, atmp and btmp as TEMP" %}
@@ -4541,7 +4541,7 @@ instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp,
 %}
 
 instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{
-  predicate(UseAVX > 0 && SuperWord::is_reduction(n));
+  predicate(UseAVX > 0 && VLoopReductions::is_reduction(n));
   match(Set dst (MinF a b));
   effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
 
@@ -4555,7 +4555,7 @@ instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe
 
 // min = java.lang.Math.min(double a, double b)
 instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{
-  predicate(UseAVX > 0 && !SuperWord::is_reduction(n));
+  predicate(UseAVX > 0 && !VLoopReductions::is_reduction(n));
   match(Set dst (MinD a b));
   effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp);
     format %{ "minD $dst, $a, $b \t! using tmp, atmp and btmp as TEMP" %}
@@ -4566,7 +4566,7 @@ instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp,
 %}
 
 instruct minD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{
-  predicate(UseAVX > 0 && SuperWord::is_reduction(n));
+  predicate(UseAVX > 0 && VLoopReductions::is_reduction(n));
   match(Set dst (MinD a b));
   effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr);
 
diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp
index db8e44f08e92d..406158eaee42f 100644
--- a/src/hotspot/share/opto/loopopts.cpp
+++ b/src/hotspot/share/opto/loopopts.cpp
@@ -4237,7 +4237,7 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
     return AutoVectorizeStatus::TriedAndFailed;
   }
 
-  SuperWord sw(vloop, vshared);
+  SuperWord sw(vloop_analyzer, vshared);
   if (!sw.transform_loop()) {
     return AutoVectorizeStatus::TriedAndFailed;
   }
diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index 10589d31945c2..3e7aeb2432a7b 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -38,20 +38,19 @@
 #include "opto/movenode.hpp"
 #include "utilities/powerOfTwo.hpp"
 
-SuperWord::SuperWord(const VLoop &vloop, VSharedData &vshared) :
-  _vloop(vloop),
+SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared) :
+  _vloop_analyzer(vloop_analyzer),
   _arena(mtCompiler),
   _packset(arena(), 8,  0, nullptr),                        // packs for the current block
   _bb_idx(vshared.node_idx_to_loop_body_idx()),             // node idx to index in bb
-  _block(arena(), vloop.estimated_body_length(), 0, nullptr), // nodes in current block
+  _block(arena(), vloop().estimated_body_length(), 0, nullptr), // nodes in current block
   _mem_slice_head(arena(), 8,  0, nullptr),                 // memory slice heads
   _mem_slice_tail(arena(), 8,  0, nullptr),                 // memory slice tails
-  _node_info(arena(), vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
+  _node_info(arena(), vloop().estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
   _clone_map(phase()->C->clone_map()),                      // map of nodes created in cloning
   _align_to_ref(nullptr),                                   // memory reference to align vectors to
   _dg(arena()),                                             // dependence graph
-  _nlist(arena(), vloop.estimated_body_length(), 0, nullptr), // scratch list of nodes
-  _loop_reductions(arena()),                                // reduction nodes in the current loop
+  _nlist(arena(), vloop().estimated_body_length(), 0, nullptr), // scratch list of nodes
   _race_possible(false),                                    // cases where SDMU is true
   _do_vector_loop(phase()->C->do_vector_loop()),            // whether to do vectorization/simd style
   _num_work_vecs(0),                                        // amount of vector work we have
@@ -255,7 +254,7 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa
   }
 }
 
-bool SuperWord::is_reduction(const Node* n) {
+bool VLoopReductions::is_reduction(const Node* n) {
   if (!is_reduction_operator(n)) {
     return false;
   }
@@ -269,12 +268,12 @@ bool SuperWord::is_reduction(const Node* n) {
   return false;
 }
 
-bool SuperWord::is_reduction_operator(const Node* n) {
+bool VLoopReductions::is_reduction_operator(const Node* n) {
   int opc = n->Opcode();
   return (opc != ReductionNode::opcode(opc, n->bottom_type()->basic_type()));
 }
 
-bool SuperWord::in_reduction_cycle(const Node* n, uint input) {
+bool VLoopReductions::in_reduction_cycle(const Node* n, uint input) {
   // First find input reduction path to phi node.
   auto has_my_opcode = [&](const Node* m){ return m->Opcode() == n->Opcode(); };
   PathEnd path_to_phi = find_in_path(n, input, LoopMaxUnroll, has_my_opcode,
@@ -291,7 +290,7 @@ bool SuperWord::in_reduction_cycle(const Node* n, uint input) {
   return path_from_phi.first != nullptr;
 }
 
-Node* SuperWord::original_input(const Node* n, uint i) {
+Node* VLoopReductions::original_input(const Node* n, uint i) {
   if (n->has_swapped_edges()) {
     assert(n->is_Add() || n->is_Mul(), "n should be commutative");
     if (i == 1) {
@@ -303,21 +302,21 @@ Node* SuperWord::original_input(const Node* n, uint i) {
   return n->in(i);
 }
 
-void SuperWord::mark_reductions() {
-
-  _loop_reductions.clear();
+void VLoopReductions::mark_reductions() {
+  assert(_loop_reductions.is_empty(), "must not yet be computed");
+  CountedLoopNode* cl = vloop().cl();
 
   // Iterate through all phi nodes associated to the loop and search for
   // reduction cycles in the basic block.
-  for (DUIterator_Fast imax, i = cl()->fast_outs(imax); i < imax; i++) {
-    const Node* phi = cl()->fast_out(i);
+  for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) {
+    const Node* phi = cl->fast_out(i);
     if (!phi->is_Phi()) {
       continue;
     }
     if (phi->outcnt() == 0) {
       continue;
     }
-    if (phi == iv()) {
+    if (phi == vloop().iv()) {
       continue;
     }
     // The phi's loop-back is considered the first node in the reduction cycle.
@@ -341,8 +340,9 @@ void SuperWord::mark_reductions() {
       // to the phi node following edge index 'input'.
       PathEnd path =
         find_in_path(
-          first, input, lpt()->_body.size(),
-          [&](const Node* n) { return n->Opcode() == first->Opcode() && in_bb(n); },
+          first, input, vloop().lpt()->_body.size(),
+          [&](const Node* n) { return n->Opcode() == first->Opcode() &&
+                                      vloop().in_bb(n); },
           [&](const Node* n) { return n == phi; });
       if (path.first != nullptr) {
         reduction_input = input;
@@ -361,7 +361,7 @@ void SuperWord::mark_reductions() {
     for (int i = 0; i < path_nodes; i++) {
       for (DUIterator_Fast jmax, j = current->fast_outs(jmax); j < jmax; j++) {
         Node* u = current->fast_out(j);
-        if (!in_bb(u)) {
+        if (!vloop().in_bb(u)) {
           continue;
         }
         if (u == succ) {
@@ -381,6 +381,7 @@ void SuperWord::mark_reductions() {
     }
     // Reduction cycle found. Mark all nodes in the found path as reductions.
     current = first;
+    // TODO trace this
     for (int i = 0; i < path_nodes; i++) {
       _loop_reductions.set(current->_idx);
       current = original_input(current, reduction_input);
@@ -453,24 +454,11 @@ bool SuperWord::transform_loop() {
 bool SuperWord::SLP_extract() {
   assert(cl()->is_main_loop(), "SLP should only work on main loops");
 
-  if (SuperWordReductions) {
-    mark_reductions();
-  }
+  // TODO remove all the VLoopAnalyzer stuff
 
   // Find memory slices
   find_memory_slices();
 
-  if (!is_marked_reduction_loop() &&
-      _mem_slice_head.is_empty()) {
-#ifndef PRODUCT
-    if (is_trace_superword_any()) {
-      tty->print_cr("\nNo reductions or memory slices found, abort SuperWord.");
-      tty->cr();
-    }
-#endif
-    return false;
-  }
-
   // Ready the block
   if (!construct_bb()) {
 #ifndef PRODUCT
@@ -1120,26 +1108,19 @@ bool SuperWord::have_similar_inputs(Node* s1, Node* s2) {
   return true;
 }
 
-//------------------------------reduction---------------------------
-// Is there a data path between s1 and s2 and the nodes reductions?
-bool SuperWord::reduction(Node* s1, Node* s2) {
-  bool retValue = false;
-  int d1 = depth(s1);
-  int d2 = depth(s2);
-  if (d2 > d1) {
-    if (is_marked_reduction(s1) && is_marked_reduction(s2)) {
-      // This is an ordered set, so s1 should define s2
-      for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
-        Node* t1 = s1->fast_out(i);
-        if (t1 == s2) {
-          // both nodes are reductions and connected
-          retValue = true;
-        }
+bool VLoopReductions::is_marked_reduction_pair(Node* s1, Node* s2) const {
+  if (is_marked_reduction(s1) &&
+      is_marked_reduction(s2)) {
+    // This is an ordered set, so s1 should define s2
+    for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) {
+      Node* t1 = s1->fast_out(i);
+      if (t1 == s2) {
+        // both nodes are reductions and connected
+        return true;
       }
     }
   }
-
-  return retValue;
+  return false;
 }
 
 //------------------------------set_alignment---------------------------
@@ -1876,9 +1857,8 @@ bool SuperWord::profitable(Node_List* p) {
     Node* second_in = p0->in(2);
     Node_List* second_pk = my_pack(second_in);
     if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) {
-      // Unmark reduction if no parent pack or if not enough work
+      // No parent pack or not enough work
       // to cover reduction expansion overhead
-      _loop_reductions.remove(p0->_idx);
       return false;
     } else if (second_pk->size() != p->size()) {
       return false;
diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp
index 691aa97928a2c..06ba6653d2766 100644
--- a/src/hotspot/share/opto/superword.hpp
+++ b/src/hotspot/share/opto/superword.hpp
@@ -26,7 +26,6 @@
 
 #include "opto/vectorization.hpp"
 #include "utilities/growableArray.hpp"
-#include "utilities/pair.hpp"
 
 //
 //                  S U P E R W O R D   T R A N S F O R M
@@ -199,7 +198,7 @@ class SWNodeInfo {
 // Transforms scalar operations into packed (superword) operations.
 class SuperWord : public ResourceObj {
  private:
-  const VLoop& _vloop;
+  const VLoopAnalyzer& _vloop_analyzer;
 
   // Arena for small data structures. Large data structures are allocated in
   // VSharedData, and reused over many AutoVectorizations.
@@ -224,7 +223,7 @@ class SuperWord : public ResourceObj {
   GrowableArray<Node*> _nlist; // List of nodes
 
  public:
-  SuperWord(const VLoop &vloop, VSharedData &vshared);
+  SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared);
 
   // Attempt to run the SuperWord algorithm on the loop. Return true if we succeed.
   bool transform_loop();
@@ -232,15 +231,26 @@ class SuperWord : public ResourceObj {
   // Decide if loop can eventually be vectorized, and what unrolling factor is required.
   static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor);
 
+  // VLoopAnalyzer Accessors
+  const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; }
+
   // VLoop Accessors
-  const VLoop& vloop()        const { return _vloop; }
-  PhaseIdealLoop* phase()     const { return vloop().phase(); }
-  PhaseIterGVN& igvn()        const { return vloop().phase()->igvn(); }
-  IdealLoopTree* lpt()        const { return vloop().lpt(); }
-  CountedLoopNode* cl()       const { return vloop().cl(); }
-  PhiNode* iv()               const { return vloop().iv(); }
-  int iv_stride()             const { return cl()->stride_con(); }
-  bool in_bb(const Node* n)   const { return vloop().in_bb(n); }
+  const VLoop& vloop()                  const { return vloop_analyzer().vloop(); }
+  PhaseIdealLoop* phase()               const { return vloop().phase(); }
+  PhaseIterGVN& igvn()                  const { return vloop().phase()->igvn(); }
+  IdealLoopTree* lpt()                  const { return vloop().lpt(); }
+  CountedLoopNode* cl()                 const { return vloop().cl(); }
+  PhiNode* iv()                         const { return vloop().iv(); }
+  int iv_stride()                       const { return cl()->stride_con(); }
+  bool in_bb(const Node* n)             const { return vloop().in_bb(n); }
+
+  // VLoopReductions Accessors
+  bool is_marked_reduction(const Node* n) const {
+    return vloop_analyzer().reductions().is_marked_reduction(n);
+  }
+  bool reduction(Node* s1, Node* s2) const {
+    return vloop_analyzer().reductions().is_marked_reduction_pair(s1, s2);
+  }
 
 #ifndef PRODUCT
   // TraceAutoVectorization and TraceSuperWord
@@ -315,7 +325,6 @@ class SuperWord : public ResourceObj {
   const GrowableArray<Node*>&      block()   const { return _block; }
   const DepGraph&                  dg()      const { return _dg; }
  private:
-  VectorSet      _loop_reductions; // Reduction nodes in the current loop
   bool           _race_possible;   // In cases where SDMU is true
   bool           _do_vector_loop;  // whether to do vectorization/simd style
   int            _num_work_vecs;   // Number of non memory vector operations
@@ -376,65 +385,7 @@ class SuperWord : public ResourceObj {
   bool same_origin_idx(Node* a, Node* b) const;
   bool same_generation(Node* a, Node* b) const;
 
-  // methods
-
-  typedef const Pair<const Node*, int> PathEnd;
-
-  // Search for a path P = (n_1, n_2, ..., n_k) such that:
-  // - original_input(n_i, input) = n_i+1 for all 1 <= i < k,
-  // - path(n) for all n in P,
-  // - k <= max, and
-  // - there exists a node e such that original_input(n_k, input) = e and end(e).
-  // Return <e, k>, if P is found, or <nullptr, -1> otherwise.
-  // Note that original_input(n, i) has the same behavior as n->in(i) except
-  // that it commutes the inputs of binary nodes whose edges have been swapped.
-  template <typename NodePredicate1, typename NodePredicate2>
-  static PathEnd find_in_path(const Node *n1, uint input, int max,
-                              NodePredicate1 path, NodePredicate2 end) {
-    const PathEnd no_path(nullptr, -1);
-    const Node* current = n1;
-    int k = 0;
-    for (int i = 0; i <= max; i++) {
-      if (current == nullptr) {
-        return no_path;
-      }
-      if (end(current)) {
-        return PathEnd(current, k);
-      }
-      if (!path(current)) {
-        return no_path;
-      }
-      current = original_input(current, input);
-      k++;
-    }
-    return no_path;
-  }
-
-public:
-  // Whether n is a reduction operator and part of a reduction cycle.
-  // This function can be used for individual queries outside the SLP analysis,
-  // e.g. to inform matching in target-specific code. Otherwise, the
-  // almost-equivalent but faster SuperWord::mark_reductions() is preferable.
-  static bool is_reduction(const Node* n);
-  // Whether n is marked as a reduction node.
-  bool is_marked_reduction(Node* n) { return _loop_reductions.test(n->_idx); }
-  // Whether the current loop has any reduction node.
-  bool is_marked_reduction_loop() { return !_loop_reductions.is_empty(); }
 private:
-  // Whether n is a standard reduction operator.
-  static bool is_reduction_operator(const Node* n);
-  // Whether n is part of a reduction cycle via the 'input' edge index. To bound
-  // the search, constrain the size of reduction cycles to LoopMaxUnroll.
-  static bool in_reduction_cycle(const Node* n, uint input);
-  // Reference to the i'th input node of n, commuting the inputs of binary nodes
-  // whose edges have been swapped. Assumes n is a commutative operation.
-  static Node* original_input(const Node* n, uint i);
-  // Find and mark reductions in a loop. Running mark_reductions() is similar to
-  // querying is_reduction(n) for every n in the SuperWord loop, but stricter in
-  // that it assumes counted loops and requires that reduction nodes are not
-  // used within the loop except by their reduction cycle predecessors.
-  void mark_reductions();
-  // Extract the superword level parallelism
   bool SLP_extract();
   // Find the adjacent memory references and create pack pairs for them.
   void find_adjacent_refs();
@@ -466,8 +417,6 @@ class SuperWord : public ResourceObj {
   // For a node pair (s1, s2) which is isomorphic and independent,
   // do s1 and s2 have similar input edges?
   bool have_similar_inputs(Node* s1, Node* s2);
-  // Is there a data path between s1 and s2 and both are reductions?
-  bool reduction(Node* s1, Node* s2);
   void set_alignment(Node* s1, Node* s2, int align);
   int data_size(Node* s);
   // Extend packset by following use->def and def->use links from pack members.
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index a867561c1aa45..a1369da7f4df3 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -144,6 +144,20 @@ const char* VLoopAnalyzer::setup_submodules_helper() {
     return VLoopAnalyzer::FAILURE_NO_MAX_UNROLL;
   }
 
+  if (SuperWordReductions) {
+    _reductions.mark_reductions();
+  }
+
+  // TODO _memory_slices.analyze();
+
+  // // If there is no memory slice detected, that means there is no store.
+  // // If there is no reduction and no store, then we give up, because
+  // // vectorization is not possible anyway (given current limitations).
+  // if (!reductions().is_marked_reduction_loop() &&
+  //     _memory_slices.heads().is_empty()) {
+  //   return VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE;
+  // }
+
   // TODO
   return VLoopAnalyzer::SUCCESS;
 }
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index 6106cb8547ccf..30c386465fd9f 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -28,6 +28,7 @@
 #include "opto/node.hpp"
 #include "opto/loopnode.hpp"
 #include "opto/traceAutoVectorizationTag.hpp"
+#include "utilities/pair.hpp"
 
 // Code in this file and the vectorization.cpp contains shared logics and
 // utilities for C2's loop auto-vectorization.
@@ -170,7 +171,81 @@ class VSharedData : public StackObj {
   }
 };
 
-// TODO submodules
+// Submodule of VLoopAnalyzer.
+// Identify and mark all reductions in the loop.
+class VLoopReductions : public StackObj {
+private:
+  typedef const Pair<const Node*, int> PathEnd;
+
+  const VLoop& _vloop;
+  VectorSet _loop_reductions;
+
+public:
+  VLoopReductions(Arena* arena, const VLoop& vloop) :
+    _vloop(vloop),
+    _loop_reductions(arena) {}
+
+  NONCOPYABLE(VLoopReductions);
+
+private:
+  const VLoop& vloop() const { return _vloop; }
+
+  // Search for a path P = (n_1, n_2, ..., n_k) such that:
+  // - original_input(n_i, input) = n_i+1 for all 1 <= i < k,
+  // - path(n) for all n in P,
+  // - k <= max, and
+  // - there exists a node e such that original_input(n_k, input) = e and end(e).
+  // Return <e, k>, if P is found, or <nullptr, -1> otherwise.
+  // Note that original_input(n, i) has the same behavior as n->in(i) except
+  // that it commutes the inputs of binary nodes whose edges have been swapped.
+  template <typename NodePredicate1, typename NodePredicate2>
+  static PathEnd find_in_path(const Node* n1, uint input, int max,
+                              NodePredicate1 path, NodePredicate2 end) {
+    const PathEnd no_path(nullptr, -1);
+    const Node* current = n1;
+    int k = 0;
+    for (int i = 0; i <= max; i++) {
+      if (current == nullptr) {
+        return no_path;
+      }
+      if (end(current)) {
+        return PathEnd(current, k);
+      }
+      if (!path(current)) {
+        return no_path;
+      }
+      current = original_input(current, input);
+      k++;
+    }
+    return no_path;
+  }
+
+public:
+  // Find and mark reductions in a loop. Running mark_reductions() is similar to
+  // querying is_reduction(n) for every node in the loop, but stricter in
+  // that it assumes counted loops and requires that reduction nodes are not
+  // used within the loop except by their reduction cycle predecessors.
+  void mark_reductions();
+  // Whether n is a reduction operator and part of a reduction cycle.
+  // This function can be used for individual queries outside auto-vectorization,
+  // e.g. to inform matching in target-specific code. Otherwise, the
+  // almost-equivalent but faster mark_reductions() is preferable.
+  static bool is_reduction(const Node* n);
+  // Whether n is marked as a reduction node.
+  bool is_marked_reduction(const Node* n) const { return _loop_reductions.test(n->_idx); }
+  bool is_marked_reduction_loop() const { return !_loop_reductions.is_empty(); }
+  // Are s1 and s2 reductions with a data path between them?
+  bool is_marked_reduction_pair(Node* s1, Node* s2) const;
+private:
+  // Whether n is a standard reduction operator.
+  static bool is_reduction_operator(const Node* n);
+  // Whether n is part of a reduction cycle via the 'input' edge index. To bound
+  // the search, constrain the size of reduction cycles to LoopMaxUnroll.
+  static bool in_reduction_cycle(const Node* n, uint input);
+  // Reference to the i'th input node of n, commuting the inputs of binary nodes
+  // whose edges have been swapped. Assumes n is a commutative operation.
+  static Node* original_input(const Node* n, uint i);
+};
 
 // Analyze the loop in preparation for auto-vectorization. This class is
 // deliberately structured into many submodules, which are as independent
@@ -193,12 +268,14 @@ class VLoopAnalyzer : StackObj {
 
   // Submodules
   // TODO
+  VLoopReductions            _reductions;
 
 public:
   VLoopAnalyzer(const VLoop& vloop, VSharedData &vshared) :
     _vloop(vloop),
     _arena(mtCompiler),
-    _success(false)
+    _success(false),
+    _reductions      (&_arena, vloop)
     // TODO modules
   {
     _success = setup_submodules();
@@ -207,8 +284,11 @@ class VLoopAnalyzer : StackObj {
 
   bool success() const { return _success; }
 
+  Arena* arena()       { return &_arena; }
+
   // Read-only accessors for submodules
   const VLoop& vloop()                           const { return _vloop; }
+  const VLoopReductions& reductions()            const { return _reductions; }
   // TODO
 
 private:

From 760b3798dc5c02acad5720a505ac22472010385b Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Sat, 10 Feb 2024 20:46:28 +0100
Subject: [PATCH 03/13] Extract VLoopMemorySlices from SuperWord

---
 src/hotspot/share/opto/superword.cpp          | 115 +++++++++---------
 src/hotspot/share/opto/superword.hpp          |  28 ++---
 .../share/opto/traceAutoVectorizationTag.hpp  |   4 +-
 src/hotspot/share/opto/vectorization.cpp      |  18 +--
 src/hotspot/share/opto/vectorization.hpp      |  53 +++++++-
 5 files changed, 128 insertions(+), 90 deletions(-)

diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index 3e7aeb2432a7b..a0d56f08a8124 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -44,13 +44,10 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared)
   _packset(arena(), 8,  0, nullptr),                        // packs for the current block
   _bb_idx(vshared.node_idx_to_loop_body_idx()),             // node idx to index in bb
   _block(arena(), vloop().estimated_body_length(), 0, nullptr), // nodes in current block
-  _mem_slice_head(arena(), 8,  0, nullptr),                 // memory slice heads
-  _mem_slice_tail(arena(), 8,  0, nullptr),                 // memory slice tails
   _node_info(arena(), vloop().estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
   _clone_map(phase()->C->clone_map()),                      // map of nodes created in cloning
   _align_to_ref(nullptr),                                   // memory reference to align vectors to
   _dg(arena()),                                             // dependence graph
-  _nlist(arena(), vloop().estimated_body_length(), 0, nullptr), // scratch list of nodes
   _race_possible(false),                                    // cases where SDMU is true
   _do_vector_loop(phase()->C->do_vector_loop()),            // whether to do vectorization/simd style
   _num_work_vecs(0),                                        // amount of vector work we have
@@ -456,9 +453,6 @@ bool SuperWord::SLP_extract() {
 
   // TODO remove all the VLoopAnalyzer stuff
 
-  // Find memory slices
-  find_memory_slices();
-
   // Ready the block
   if (!construct_bb()) {
 #ifndef PRODUCT
@@ -785,16 +779,22 @@ void SuperWord::dependence_graph() {
     }
   }
 
+  const GrowableArray<PhiNode*> &mem_slice_head = vloop_analyzer().memory_slices().heads();
+  const GrowableArray<MemNode*> &mem_slice_tail = vloop_analyzer().memory_slices().tails();
+
+  ResourceMark rm;
+  GrowableArray<Node*> slice_nodes;
+
   // For each memory slice, create the dependences
-  for (int i = 0; i < _mem_slice_head.length(); i++) {
-    Node* n      = _mem_slice_head.at(i);
-    Node* n_tail = _mem_slice_tail.at(i);
+  for (int i = 0; i < mem_slice_head.length(); i++) {
+    PhiNode* head = mem_slice_head.at(i);
+    MemNode* tail = mem_slice_tail.at(i);
 
     // Get slice in predecessor order (last is first)
-    mem_slice_preds(n_tail, n, _nlist);
+    vloop_analyzer().memory_slices().get_slice(head, tail, slice_nodes);
 
     // Make the slice dependent on the root
-    DepMem* slice = _dg.dep(n);
+    DepMem* slice = _dg.dep(head);
     _dg.make_edge(_dg.root(), slice);
 
     // Create a sink for the slice
@@ -802,8 +802,8 @@ void SuperWord::dependence_graph() {
     _dg.make_edge(slice_sink, _dg.tail());
 
     // Now visit each pair of memory ops, creating the edges
-    for (int j = _nlist.length() - 1; j >= 0 ; j--) {
-      Node* s1 = _nlist.at(j);
+    for (int j = slice_nodes.length() - 1; j >= 0 ; j--) {
+      Node* s1 = slice_nodes.at(j);
 
       // If no dependency yet, use slice
       if (_dg.dep(s1)->in_cnt() == 0) {
@@ -812,7 +812,7 @@ void SuperWord::dependence_graph() {
       VPointer p1(s1->as_Mem(), vloop());
       bool sink_dependent = true;
       for (int k = j - 1; k >= 0; k--) {
-        Node* s2 = _nlist.at(k);
+        Node* s2 = slice_nodes.at(k);
         if (s1->is_Load() && s2->is_Load())
           continue;
         VPointer p2(s2->as_Mem(), vloop());
@@ -831,68 +831,68 @@ void SuperWord::dependence_graph() {
 
 #ifndef PRODUCT
     if (is_trace_superword_dependence_graph()) {
-      tty->print_cr("\nDependence graph for slice: %d", n->_idx);
-      for (int q = 0; q < _nlist.length(); q++) {
-        _dg.print(_nlist.at(q));
+      tty->print_cr("\nDependence graph for slice: %d", head->_idx);
+      for (int q = 0; q < slice_nodes.length(); q++) {
+        _dg.print(slice_nodes.at(q));
       }
       tty->cr();
     }
 #endif
 
-    _nlist.clear();
+    slice_nodes.clear();
   }
 }
 
-void SuperWord::find_memory_slices() {
-  assert(_mem_slice_head.length() == 0, "mem_slice_head is empty");
-  assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty");
+void VLoopMemorySlices::find_memory_slices() {
+  assert(_heads.is_empty(), "not yet computed");
+  assert(_tails.is_empty(), "not yet computed");
+  CountedLoopNode* cl = vloop().cl();
 
   // Iterate over all memory phis
-  for (DUIterator_Fast imax, i = cl()->fast_outs(imax); i < imax; i++) {
-    PhiNode* phi = cl()->fast_out(i)->isa_Phi();
-    if (phi != nullptr && in_bb(phi) && phi->is_memory_phi()) {
+  for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) {
+    PhiNode* phi = cl->fast_out(i)->isa_Phi();
+    if (phi != nullptr && vloop().in_bb(phi) && phi->is_memory_phi()) {
       Node* phi_tail = phi->in(LoopNode::LoopBackControl);
       if (phi_tail != phi->in(LoopNode::EntryControl)) {
-        _mem_slice_head.push(phi);
-        _mem_slice_tail.push(phi_tail->as_Mem());
+        _heads.push(phi);
+        _tails.push(phi_tail->as_Mem());
       }
     }
   }
 
-  NOT_PRODUCT( if (is_trace_superword_memory_slices()) { print_memory_slices(); } )
+  NOT_PRODUCT( if (vloop().is_trace_memory_slices()) { print(); } )
 }
 
 #ifndef PRODUCT
-void SuperWord::print_memory_slices() {
-  tty->print_cr("\nSuperWord::print_memory_slices: %s",
-                _mem_slice_head.length() > 0 ? "" : "NONE");
-  for (int m = 0; m < _mem_slice_head.length(); m++) {
-    tty->print("%6d ", m);  _mem_slice_head.at(m)->dump();
-    tty->print("       ");  _mem_slice_tail.at(m)->dump();
+void VLoopMemorySlices::print() const {
+  tty->print_cr("\nVLoopMemorySlices::print: %s",
+                heads().length() > 0 ? "" : "NONE");
+  for (int m = 0; m < heads().length(); m++) {
+    tty->print("%6d ", m);  heads().at(m)->dump();
+    tty->print("       ");  tails().at(m)->dump();
   }
 }
 #endif
 
-//---------------------------mem_slice_preds---------------------------
-// Return a memory slice (node list) in predecessor order starting at "start"
-void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &preds) {
-  assert(preds.length() == 0, "start empty");
-  Node* n = start;
+// Get all memory nodes of a slice, in reverse order
+void VLoopMemorySlices::get_slice(PhiNode* head, MemNode* tail, GrowableArray<Node*> &slice) const {
+  assert(slice.length() == 0, "start empty");
+  Node* n = tail;
   Node* prev = nullptr;
   while (true) {
-    assert(in_bb(n), "must be in block");
+    assert(vloop().in_bb(n), "must be in block");
     for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
       Node* out = n->fast_out(i);
       if (out->is_Load()) {
-        if (in_bb(out)) {
-          preds.push(out);
+        if (vloop().in_bb(out)) {
+          slice.push(out);
         }
       } else {
         // FIXME
-        if (out->is_MergeMem() && !in_bb(out)) {
+        if (out->is_MergeMem() && !vloop().in_bb(out)) {
           // Either unrolling is causing a memory edge not to disappear,
           // or need to run igvn.optimize() again before SLP
-        } else if (out->is_memory_phi() && !in_bb(out)) {
+        } else if (out->is_memory_phi() && !vloop().in_bb(out)) {
           // Ditto.  Not sure what else to check further.
         } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
           // StoreCM has an input edge used as a precedence edge.
@@ -902,19 +902,19 @@ void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &p
         }
       }//else
     }//for
-    if (n == stop) break;
-    preds.push(n);
+    if (n == head) { break; }
+    slice.push(n);
     prev = n;
     assert(n->is_Mem(), "unexpected node %s", n->Name());
     n = n->in(MemNode::Memory);
   }
 
 #ifndef PRODUCT
-  if (is_trace_superword_memory_slices()) {
-    tty->print_cr("\nSuperWord::mem_slice_preds:");
-    stop->dump();
-    for (int j = preds.length() - 1; j >= 0 ; j--) {
-      preds.at(j)->dump();
+  if (vloop().is_trace_memory_slices()) {
+    tty->print_cr("\nVLoopMemorySlices::get_slice:");
+    head->dump();
+    for (int j = slice.length() - 1; j >= 0 ; j--) {
+      slice.at(j)->dump();
     }
   }
 #endif
@@ -2262,9 +2262,11 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
   // loop we may have a different last store, and we need to adjust the uses accordingly.
   GrowableArray<Node*> old_last_store_in_slice(max_slices, max_slices, nullptr);
 
+  const GrowableArray<PhiNode*> &mem_slice_head = vloop_analyzer().memory_slices().heads();
+
   // (1) Set up the initial memory state from Phi. And find the old last store.
-  for (int i = 0; i < _mem_slice_head.length(); i++) {
-    Node* phi  = _mem_slice_head.at(i);
+  for (int i = 0; i < mem_slice_head.length(); i++) {
+    Node* phi  = mem_slice_head.at(i);
     assert(phi->is_Phi(), "must be phi");
     int alias_idx = phase()->C->get_alias_index(phi->adr_type());
     current_state_in_slice.at_put(alias_idx, phi);
@@ -2299,8 +2301,8 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
   //     in the Phi. Further, we replace uses of the old last store
   //     with uses of the new last store (current_state).
   Node_List uses_after_loop;
-  for (int i = 0; i < _mem_slice_head.length(); i++) {
-    Node* phi  = _mem_slice_head.at(i);
+  for (int i = 0; i < mem_slice_head.length(); i++) {
+    Node* phi  = mem_slice_head.at(i);
     int alias_idx = phase()->C->get_alias_index(phi->adr_type());
     Node* current_state = current_state_in_slice.at(alias_idx);
     assert(current_state != nullptr, "slice is mapped");
@@ -3274,8 +3276,9 @@ bool SuperWord::same_velt_type(Node* n1, Node* n2) {
   return vt1 == vt2;
 }
 
-bool SuperWord::same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const {
-  return phase()->C->get_alias_index(mem_ref->adr_type()) == phase()->C->get_alias_index(best_align_to_mem_ref->adr_type());
+bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const {
+  return vloop().phase()->C->get_alias_index(m1->adr_type()) ==
+         vloop().phase()->C->get_alias_index(m2->adr_type());
 }
 
 //------------------------------in_packset---------------------------
diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp
index 06ba6653d2766..db890e5539250 100644
--- a/src/hotspot/share/opto/superword.hpp
+++ b/src/hotspot/share/opto/superword.hpp
@@ -211,17 +211,12 @@ class SuperWord : public ResourceObj {
   GrowableArray<int> &_bb_idx;           // Map from Node _idx to index within block
 
   GrowableArray<Node*> _block;           // Nodes in current block
-  GrowableArray<PhiNode*> _mem_slice_head; // Memory slice head nodes
-  GrowableArray<MemNode*> _mem_slice_tail; // Memory slice tail nodes
   GrowableArray<SWNodeInfo> _node_info;  // Info needed per node
   CloneMap&            _clone_map;       // map of nodes created in cloning
   MemNode const* _align_to_ref;          // Memory reference that pre-loop will align to
 
   DepGraph _dg; // Dependence graph
 
-  // Scratch pads
-  GrowableArray<Node*> _nlist; // List of nodes
-
  public:
   SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared);
 
@@ -248,8 +243,14 @@ class SuperWord : public ResourceObj {
   bool is_marked_reduction(const Node* n) const {
     return vloop_analyzer().reductions().is_marked_reduction(n);
   }
-  bool reduction(Node* s1, Node* s2) const {
-    return vloop_analyzer().reductions().is_marked_reduction_pair(s1, s2);
+
+  bool reduction(Node* n1, Node* n2) const {
+    return vloop_analyzer().reductions().is_marked_reduction_pair(n1, n2);
+  }
+
+  // VLoopMemorySlices Accessors
+  bool same_memory_slice(MemNode* n1, MemNode* n2) const {
+    return vloop_analyzer().memory_slices().same_memory_slice(n1, n2);
   }
 
 #ifndef PRODUCT
@@ -264,11 +265,6 @@ class SuperWord : public ResourceObj {
     return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
   }
 
-  bool is_trace_superword_memory_slices() const {
-    return TraceSuperWord ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES);
-  }
-
   bool is_trace_superword_dependence_graph() const {
     return TraceSuperWord ||
            vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH);
@@ -304,7 +300,6 @@ class SuperWord : public ResourceObj {
            is_trace_align_vector() ||
            vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES) ||
            vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES) ||
            vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
            vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
            vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
@@ -370,7 +365,6 @@ class SuperWord : public ResourceObj {
   BasicType velt_basic_type(const Node* n) const { return velt_type(n)->array_element_basic_type(); }
   void set_velt_type(Node* n, const Type* t) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_velt_type = t; }
   bool same_velt_type(Node* n1, Node* n2);
-  bool same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const;
 
   // my_pack
  public:
@@ -396,12 +390,6 @@ class SuperWord : public ResourceObj {
   // Construct dependency graph.
   void dependence_graph();
 
-  // Analyze the memory slices
-  void find_memory_slices();
-  NOT_PRODUCT( void print_memory_slices(); )
-  // Return a memory slice (node list) in predecessor order starting at "start"
-  void mem_slice_preds(Node* start, Node* stop, GrowableArray<Node*> &preds);
-
   // Can s1 and s2 be in a pack with s1 immediately preceding s2 and  s1 aligned at "align"
   bool stmts_can_pack(Node* s1, Node* s2, int align);
   // Does s exist in a pack at position pos?
diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
index 78f1301010aae..c7576e7343dfd 100644
--- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
+++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
@@ -32,9 +32,9 @@
   flags(POINTER_ANALYSIS,     "Trace VPointer") \
   flags(PRECONDITIONS,        "Trace VLoop::check_preconditions") \
   flags(LOOP_ANALYZER,        "Trace VLoopAnalyzer::setup_submodules") \
+  flags(MEMORY_SLICES,        "Trace VLoopMemorySlices") \
   flags(SW_TYPES,             "Trace SuperWord::compute_vector_element_type") \
   flags(SW_ALIGNMENT,         "Trace SuperWord alignment analysis") \
-  flags(SW_MEMORY_SLICES,     "Trace SuperWord memory slices") \
   flags(SW_DEPENDENCE_GRAPH,  "Trace SuperWord::dependence_graph") \
   flags(SW_ADJACENT_MEMOPS,   "Trace SuperWord::find_adjacent_refs") \
   flags(SW_REJECTIONS,        "Trace SuperWord rejections (non vectorizations)") \
@@ -115,7 +115,6 @@ class TraceAutoVectorizationTagValidator {
       } else if (SW_VERBOSE == tag) {
         _tags.at_put(SW_TYPES, set_bit);
         _tags.at_put(SW_ALIGNMENT, set_bit);
-        _tags.at_put(SW_MEMORY_SLICES, set_bit);
         _tags.at_put(SW_DEPENDENCE_GRAPH, set_bit);
         _tags.at_put(SW_ADJACENT_MEMOPS, set_bit);
         _tags.at_put(SW_REJECTIONS, set_bit);
@@ -123,7 +122,6 @@ class TraceAutoVectorizationTagValidator {
         _tags.at_put(SW_INFO, set_bit);
         _tags.at_put(SW_VERBOSE, set_bit);
       } else if (SW_INFO == tag) {
-        _tags.at_put(SW_MEMORY_SLICES, set_bit);
         _tags.at_put(SW_DEPENDENCE_GRAPH, set_bit);
         _tags.at_put(SW_ADJACENT_MEMOPS, set_bit);
         _tags.at_put(SW_REJECTIONS, set_bit);
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index a1369da7f4df3..56350b3a0a309 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -148,15 +148,15 @@ const char* VLoopAnalyzer::setup_submodules_helper() {
     _reductions.mark_reductions();
   }
 
-  // TODO _memory_slices.analyze();
-
-  // // If there is no memory slice detected, that means there is no store.
-  // // If there is no reduction and no store, then we give up, because
-  // // vectorization is not possible anyway (given current limitations).
-  // if (!reductions().is_marked_reduction_loop() &&
-  //     _memory_slices.heads().is_empty()) {
-  //   return VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE;
-  // }
+  _memory_slices.find_memory_slices();
+
+  // If there is no memory slice detected, that means there is no store.
+  // If there is no reduction and no store, then we give up, because
+  // vectorization is not possible anyway (given current limitations).
+  if (!reductions().is_marked_reduction_loop() &&
+      _memory_slices.heads().is_empty()) {
+    return VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE;
+  }
 
   // TODO
   return VLoopAnalyzer::SUCCESS;
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index 30c386465fd9f..1ade6011b2e1d 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -116,6 +116,10 @@ class VLoop : public StackObj {
     return vtrace().is_trace(TraceAutoVectorizationTag::LOOP_ANALYZER);
   }
 
+  bool is_trace_memory_slices() const {
+    return vtrace().is_trace(TraceAutoVectorizationTag::MEMORY_SLICES);
+  }
+
   bool is_trace_pointer_analysis() const {
     return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
   }
@@ -189,7 +193,6 @@ class VLoopReductions : public StackObj {
 
 private:
   const VLoop& vloop() const { return _vloop; }
-
   // Search for a path P = (n_1, n_2, ..., n_k) such that:
   // - original_input(n_i, input) = n_i+1 for all 1 <= i < k,
   // - path(n) for all n in P,
@@ -236,6 +239,7 @@ class VLoopReductions : public StackObj {
   bool is_marked_reduction_loop() const { return !_loop_reductions.is_empty(); }
   // Are s1 and s2 reductions with a data path between them?
   bool is_marked_reduction_pair(Node* s1, Node* s2) const;
+
 private:
   // Whether n is a standard reduction operator.
   static bool is_reduction_operator(const Node* n);
@@ -247,6 +251,39 @@ class VLoopReductions : public StackObj {
   static Node* original_input(const Node* n, uint i);
 };
 
+// Submodule of VLoopAnalyzer.
+// Find the memory slices in the loop.
+class VLoopMemorySlices : public StackObj {
+private:
+  const VLoop& _vloop;
+
+  GrowableArray<PhiNode*> _heads;
+  GrowableArray<MemNode*> _tails;
+
+  const VLoop& vloop() const { return _vloop; }
+
+public:
+  VLoopMemorySlices(Arena* arena, const VLoop& vloop) :
+    _vloop(vloop),
+    _heads(arena, 8, 0, nullptr),
+    _tails(arena, 8, 0, nullptr) {};
+  NONCOPYABLE(VLoopMemorySlices);
+
+  void find_memory_slices();
+
+  const GrowableArray<PhiNode*> &heads() const { return _heads; }
+  const GrowableArray<MemNode*> &tails() const { return _tails; }
+
+  // Get all memory nodes of a slice, in reverse order
+  void get_slice(PhiNode* head, MemNode* tail, GrowableArray<Node*> &slice) const;
+
+  bool same_memory_slice(MemNode* m1, MemNode* m2) const;
+
+#ifndef PRODUCT
+  void print() const;
+#endif
+};
+
 // Analyze the loop in preparation for auto-vectorization. This class is
 // deliberately structured into many submodules, which are as independent
 // as possible, though some submodules do require other submodules.
@@ -269,13 +306,21 @@ class VLoopAnalyzer : StackObj {
   // Submodules
   // TODO
   VLoopReductions            _reductions;
+  VLoopMemorySlices    _memory_slices;
+  //VLoopBody            _body;
+  //VLoopTypes           _types;
+  //VLoopDependenceGraph _dependence_graph;
 
 public:
   VLoopAnalyzer(const VLoop& vloop, VSharedData &vshared) :
     _vloop(vloop),
     _arena(mtCompiler),
     _success(false),
-    _reductions      (&_arena, vloop)
+    _reductions      (&_arena, vloop),
+    _memory_slices   (&_arena, vloop)
+    //_body            (&_arena, vloop),
+    //_types           (&_arena, vloop, body()),
+    //_dependence_graph(&_arena, vloop, memory_slices(), body())
     // TODO modules
   {
     _success = setup_submodules();
@@ -289,6 +334,10 @@ class VLoopAnalyzer : StackObj {
   // Read-only accessors for submodules
   const VLoop& vloop()                           const { return _vloop; }
   const VLoopReductions& reductions()            const { return _reductions; }
+  const VLoopMemorySlices& memory_slices()       const { return _memory_slices; }
+  //const VLoopBody& body()                        const { return _body; }
+  //const VLoopTypes& types()                      const { return _types; }
+  //const VLoopDependenceGraph& dependence_graph() const { return _dependence_graph; }
   // TODO
 
 private:

From 8e4377409465c2b62c358071bf37390c787fc822 Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Sat, 10 Feb 2024 22:03:16 +0100
Subject: [PATCH 04/13] Extract VLoopBody from SuperWord

---
 src/hotspot/share/opto/loopopts.cpp           |   2 +-
 src/hotspot/share/opto/superword.cpp          | 144 ++++++++----------
 src/hotspot/share/opto/superword.hpp          |  24 ++-
 .../share/opto/traceAutoVectorizationTag.hpp  |   1 +
 src/hotspot/share/opto/vectorization.cpp      |   5 +
 src/hotspot/share/opto/vectorization.hpp      |  54 ++++++-
 6 files changed, 131 insertions(+), 99 deletions(-)

diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp
index 406158eaee42f..ec16053e6bd45 100644
--- a/src/hotspot/share/opto/loopopts.cpp
+++ b/src/hotspot/share/opto/loopopts.cpp
@@ -4237,7 +4237,7 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) {
     return AutoVectorizeStatus::TriedAndFailed;
   }
 
-  SuperWord sw(vloop_analyzer, vshared);
+  SuperWord sw(vloop_analyzer);
   if (!sw.transform_loop()) {
     return AutoVectorizeStatus::TriedAndFailed;
   }
diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index a0d56f08a8124..15d13358c9be7 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -38,12 +38,10 @@
 #include "opto/movenode.hpp"
 #include "utilities/powerOfTwo.hpp"
 
-SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared) :
+SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
   _vloop_analyzer(vloop_analyzer),
   _arena(mtCompiler),
   _packset(arena(), 8,  0, nullptr),                        // packs for the current block
-  _bb_idx(vshared.node_idx_to_loop_body_idx()),             // node idx to index in bb
-  _block(arena(), vloop().estimated_body_length(), 0, nullptr), // nodes in current block
   _node_info(arena(), vloop().estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
   _clone_map(phase()->C->clone_map()),                      // map of nodes created in cloning
   _align_to_ref(nullptr),                                   // memory reference to align vectors to
@@ -453,17 +451,6 @@ bool SuperWord::SLP_extract() {
 
   // TODO remove all the VLoopAnalyzer stuff
 
-  // Ready the block
-  if (!construct_bb()) {
-#ifndef PRODUCT
-    if (is_trace_superword_any()) {
-      tty->print_cr("\nSuperWord::construct_bb failed: abort SuperWord");
-      tty->cr();
-    }
-#endif
-    return false;
-  }
-
   // Ensure extra info is allocated.
   initialize_node_info();
 
@@ -514,8 +501,8 @@ bool SuperWord::SLP_extract() {
 void SuperWord::find_adjacent_refs() {
   // Get list of memory operations
   Node_List memops;
-  for (int i = 0; i < _block.length(); i++) {
-    Node* n = _block.at(i);
+  for (int i = 0; i < body().length(); i++) {
+    Node* n = body().at(i);
     if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) &&
         is_java_primitive(n->as_Mem()->memory_type())) {
       int align = memory_alignment(n->as_Mem(), 0);
@@ -772,8 +759,8 @@ void SuperWord::dependence_graph() {
   assert(cl->is_main_loop(), "SLP should only work on main loops");
 
   // First, assign a dependence node to each memory node
-  for (int i = 0; i < _block.length(); i++ ) {
-    Node *n = _block.at(i);
+  for (int i = 0; i < body().length(); i++ ) {
+    Node *n = body().at(i);
     if (n->is_Mem() || n->is_memory_phi()) {
       _dg.make_node(n);
     }
@@ -1961,8 +1948,8 @@ void SuperWord::verify_packs() {
   }
 
   // Check that no other node has my_pack set.
-  for (int i = 0; i < _block.length(); i++) {
-    Node* n = _block.at(i);
+  for (int i = 0; i < body().length(); i++) {
+    Node* n = body().at(i);
     if (!processed.member(n)) {
       assert(my_pack(n) == nullptr, "should not have pack if not in packset");
     }
@@ -2045,9 +2032,9 @@ class PacksetGraph {
 
   // Create nodes (from packs and scalar-nodes), and add edges, based on DepPreds.
   void build() {
-    const GrowableArray<Node_List*> &packset = _slp->packset();
-    const GrowableArray<Node*> &block = _slp->block();
-    const DepGraph &dg = _slp->dg();
+    const GrowableArray<Node_List*>& packset = _slp->packset();
+    const GrowableArray<Node*>& body = _slp->body();
+    const DepGraph& dg = _slp->dg();
     // Map nodes in packsets
     for (int i = 0; i < packset.length(); i++) {
       Node_List* p = packset.at(i);
@@ -2062,8 +2049,8 @@ class PacksetGraph {
     int max_pid_packset = _max_pid;
 
     // Map nodes not in packset
-    for (int i = 0; i < block.length(); i++) {
-      Node* n = block.at(i);
+    for (int i = 0; i < body.length(); i++) {
+      Node* n = body.at(i);
       if (n->is_Phi() || n->is_CFG()) {
         continue; // ignore control flow
       }
@@ -2090,7 +2077,7 @@ class PacksetGraph {
           if (pred_pid == pid && _slp->is_marked_reduction(n)) {
             continue; // reduction -> self-cycle is not a cyclic dependency
           }
-          // Only add edges once, and only for mapped nodes (in block)
+          // Only add edges once, and only for mapped nodes (in body)
           if (pred_pid > 0 && !set.test_set(pred_pid)) {
             incnt_set(pid, incnt(pid) + 1); // increment
             out(pred_pid).push(pid);
@@ -2100,8 +2087,8 @@ class PacksetGraph {
     }
 
     // Map edges for nodes not in packset
-    for (int i = 0; i < block.length(); i++) {
-      Node* n = block.at(i);
+    for (int i = 0; i < body.length(); i++) {
+      Node* n = body.at(i);
       int pid = get_pid_or_zero(n); // zero for Phi or CFG
       if (pid <= max_pid_packset) {
         continue; // Only scalar-nodes
@@ -2109,7 +2096,7 @@ class PacksetGraph {
       for (DepPreds preds(n, dg); !preds.done(); preds.next()) {
         Node* pred = preds.current();
         int pred_pid = get_pid_or_zero(pred);
-        // Only add edges for mapped nodes (in block)
+        // Only add edges for mapped nodes (in body)
         if (pred_pid > 0) {
           incnt_set(pid, incnt(pid) + 1); // increment
           out(pred_pid).push(pid);
@@ -2170,7 +2157,7 @@ class PacksetGraph {
   // print_nodes = true: print all C2 nodes beloning to PacksetGrahp node.
   // print_zero_incnt = false: do not print nodes that have no in-edges (any more).
   void print(bool print_nodes, bool print_zero_incnt) {
-    const GrowableArray<Node*> &block = _slp->block();
+    const GrowableArray<Node*> &body = _slp->body();
     tty->print_cr("PacksetGraph");
     for (int pid = 1; pid <= _max_pid; pid++) {
       if (incnt(pid) == 0 && !print_zero_incnt) {
@@ -2183,8 +2170,8 @@ class PacksetGraph {
       tty->print_cr("]");
 #ifndef PRODUCT
       if (print_nodes) {
-        for (int i = 0; i < block.length(); i++) {
-          Node* n = block.at(i);
+        for (int i = 0; i < body.length(); i++) {
+          Node* n = body.at(i);
           if (get_pid_or_zero(n) == pid) {
             tty->print("    ");
             n->dump();
@@ -2364,8 +2351,8 @@ bool SuperWord::output() {
   uint max_vlen_in_bytes = 0;
   uint max_vlen = 0;
 
-  for (int i = 0; i < _block.length(); i++) {
-    Node* n = _block.at(i);
+  for (int i = 0; i < body().length(); i++) {
+    Node* n = body().at(i);
     Node_List* p = my_pack(n);
     if (p != nullptr && n == p->at(p->size()-1)) {
       // After schedule_reorder_memops, we know that the memops have the same order in the pack
@@ -2637,7 +2624,6 @@ bool SuperWord::output() {
       }
 #endif
 
-      _block.at_put(i, vn);
       igvn().register_new_node_with_optimizer(vn);
       phase()->set_ctrl(vn, phase()->get_ctrl(first));
       for (uint j = 0; j < p->size(); j++) {
@@ -2654,7 +2640,7 @@ bool SuperWord::output() {
       }
       VectorNode::trace_new_vector(vn, "SuperWord");
     }
-  }//for (int i = 0; i < _block.length(); i++)
+  }//for (int i = 0; i < body().length(); i++)
 
   if (max_vlen_in_bytes > C->max_vector_size()) {
     C->set_max_vector_size(max_vlen_in_bytes);
@@ -2918,33 +2904,32 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) {
   return true;
 }
 
-//------------------------------construct_bb---------------------------
-// Construct reverse postorder list of block members
-bool SuperWord::construct_bb() {
-  assert(_block.length() == 0,          "block is empty");
+// Return nullptr if success, else failure message
+const char* VLoopBody::construct() {
+  assert(_body.length() == 0, "body is empty");
 
   // First pass over loop body:
   //  (1) Check that there are no unwanted nodes (LoadStore, MergeMem, data Proj).
   //  (2) Count number of nodes, and create a temporary map (_idx -> bb_idx).
   //  (3) Verify that all non-ctrl nodes have an input inside the loop.
-  int block_count = 0;
-  for (uint i = 0; i < lpt()->_body.size(); i++) {
-    Node* n = lpt()->_body.at(i);
+  int body_count = 0;
+  for (uint i = 0; i < vloop().lpt()->_body.size(); i++) {
+    Node* n = vloop().lpt()->_body.at(i);
     set_bb_idx(n, i); // Create a temporary map
-    if (in_bb(n)) {
-      block_count++;
+    if (vloop().in_bb(n)) {
+      body_count++;
 
       if (n->is_LoadStore() || n->is_MergeMem() ||
           (n->is_Proj() && !n->as_Proj()->is_CFG())) {
         // Bailout if the loop has LoadStore, MergeMem or data Proj
         // nodes. Superword optimization does not work with them.
 #ifndef PRODUCT
-        if (is_trace_superword_any()) {
-          tty->print_cr("SuperWord::construct_bb: fails because of unhandled node:");
+        if (vloop().is_trace_body()) {
+          tty->print_cr("VLoopBody::construct: fails because of unhandled node:");
           n->dump();
         }
 #endif
-        return false;
+        return VLoopBody::FAILURE_NODE_NOT_ALLOWED;
       }
 
 #ifdef ASSERT
@@ -2952,7 +2937,7 @@ bool SuperWord::construct_bb() {
         bool found = false;
         for (uint j = 0; j < n->req(); j++) {
           Node* def = n->in(j);
-          if (def != nullptr && in_bb(def)) {
+          if (def != nullptr && vloop().in_bb(def)) {
             found = true;
             break;
           }
@@ -2963,17 +2948,17 @@ bool SuperWord::construct_bb() {
     }
   }
 
-  // Create a reverse-post-order list of nodes in block
+  // Create a reverse-post-order list of nodes in body
   ResourceMark rm;
   GrowableArray<Node*> stack;
   VectorSet visited;
   VectorSet post_visited;
 
-  visited.set(bb_idx(cl()));
-  stack.push(cl());
+  visited.set(bb_idx(vloop().cl()));
+  stack.push(vloop().cl());
 
   // Do a depth first walk over out edges
-  int rpo_idx = block_count - 1;
+  int rpo_idx = body_count - 1;
   while (!stack.is_empty()) {
     Node* n = stack.top(); // Leave node on stack
     if (!visited.test_set(bb_idx(n))) {
@@ -2983,9 +2968,9 @@ bool SuperWord::construct_bb() {
       const int old_length = stack.length();
       for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
         Node* use = n->fast_out(i);
-        if (in_bb(use) && !visited.test(bb_idx(use)) &&
+        if (vloop().in_bb(use) && !visited.test(bb_idx(use)) &&
             // Don't go around backedge
-            (!use->is_Phi() || n == cl())) {
+            (!use->is_Phi() || n == vloop().cl())) {
           stack.push(use);
         }
       }
@@ -2993,7 +2978,7 @@ bool SuperWord::construct_bb() {
         // There were no additional uses, post visit node now
         stack.pop(); // Remove node from stack
         assert(rpo_idx >= 0, "must still have idx to pass out");
-        _block.at_put_grow(rpo_idx, n);
+        _body.at_put_grow(rpo_idx, n);
         rpo_idx--;
         post_visited.set(bb_idx(n));
         assert(rpo_idx >= 0 || stack.is_empty(), "still have idx left or are finished");
@@ -3003,25 +2988,25 @@ bool SuperWord::construct_bb() {
     }
   }
 
-  // Create real map of block indices for nodes
-  for (int j = 0; j < _block.length(); j++) {
-    Node* n = _block.at(j);
+  // Create real map of body indices for nodes
+  for (int j = 0; j < _body.length(); j++) {
+    Node* n = _body.at(j);
     set_bb_idx(n, j);
   }
 
 #ifndef PRODUCT
-  if (is_trace_superword_info()) {
-    print_bb();
+  if (vloop().is_trace_body()) {
+    print();
   }
 #endif
 
-  assert(rpo_idx == -1 && block_count == _block.length(), "all block members found");
-  return true;
+  assert(rpo_idx == -1 && body_count == _body.length(), "all body members found");
+  return nullptr; // success
 }
 
 // Initialize per node info
 void SuperWord::initialize_node_info() {
-  Node* last = _block.at(_block.length() - 1);
+  Node* last = body().at(body().length() - 1);
   grow_node_info(bb_idx(last));
 }
 
@@ -3033,8 +3018,8 @@ void SuperWord::compute_max_depth() {
   bool again;
   do {
     again = false;
-    for (int i = 0; i < _block.length(); i++) {
-      Node* n = _block.at(i);
+    for (int i = 0; i < body().length(); i++) {
+      Node* n = body().at(i);
       if (!n->is_Phi()) {
         int d_orig = depth(n);
         int d_in   = 0;
@@ -3124,15 +3109,15 @@ void SuperWord::compute_vector_element_type() {
 #endif
 
   // Initial type
-  for (int i = 0; i < _block.length(); i++) {
-    Node* n = _block.at(i);
+  for (int i = 0; i < body().length(); i++) {
+    Node* n = body().at(i);
     set_velt_type(n, container_type(n));
   }
 
   // Propagate integer narrowed type backwards through operations
   // that don't depend on higher order bits
-  for (int i = _block.length() - 1; i >= 0; i--) {
-    Node* n = _block.at(i);
+  for (int i = body().length() - 1; i >= 0; i--) {
+    Node* n = body().at(i);
     // Only integer types need be examined
     const Type* vtn = velt_type(n);
     if (vtn->basic_type() == T_INT) {
@@ -3180,8 +3165,8 @@ void SuperWord::compute_vector_element_type() {
       }
     }
   }
-  for (int i = 0; i < _block.length(); i++) {
-    Node* n = _block.at(i);
+  for (int i = 0; i < body().length(); i++) {
+    Node* n = body().at(i);
     Node* nn = n;
     if (nn->is_Bool() && nn->in(0) == nullptr) {
       nn = nn->in(1);
@@ -3198,8 +3183,8 @@ void SuperWord::compute_vector_element_type() {
   }
 #ifndef PRODUCT
   if (is_trace_superword_vector_element_type()) {
-    for (int i = 0; i < _block.length(); i++) {
-      Node* n = _block.at(i);
+    for (int i = 0; i < body().length(); i++) {
+      Node* n = body().at(i);
       velt_type(n)->dump();
       tty->print("\t");
       n->dump();
@@ -3699,19 +3684,18 @@ void SuperWord::print_pack(Node_List* p) {
   }
 }
 
-//------------------------------print_bb---------------------------
-void SuperWord::print_bb() {
 #ifndef PRODUCT
+void VLoopBody::print() const {
   tty->print_cr("\nBlock");
-  for (int i = 0; i < _block.length(); i++) {
-    Node* n = _block.at(i);
+  for (int i = 0; i < body().length(); i++) {
+    Node* n = body().at(i);
     tty->print("%d ", i);
-    if (n) {
+    if (n != nullptr) {
       n->dump();
     }
   }
-#endif
 }
+#endif
 
 //------------------------------print_stmt---------------------------
 void SuperWord::print_stmt(Node* s) {
diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp
index db890e5539250..fa8563112bba5 100644
--- a/src/hotspot/share/opto/superword.hpp
+++ b/src/hotspot/share/opto/superword.hpp
@@ -208,9 +208,6 @@ class SuperWord : public ResourceObj {
 
   GrowableArray<Node_List*> _packset;    // Packs for the current block
 
-  GrowableArray<int> &_bb_idx;           // Map from Node _idx to index within block
-
-  GrowableArray<Node*> _block;           // Nodes in current block
   GrowableArray<SWNodeInfo> _node_info;  // Info needed per node
   CloneMap&            _clone_map;       // map of nodes created in cloning
   MemNode const* _align_to_ref;          // Memory reference that pre-loop will align to
@@ -218,7 +215,7 @@ class SuperWord : public ResourceObj {
   DepGraph _dg; // Dependence graph
 
  public:
-  SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared);
+  SuperWord(const VLoopAnalyzer &vloop_analyzer);
 
   // Attempt to run the SuperWord algorithm on the loop. Return true if we succeed.
   bool transform_loop();
@@ -253,6 +250,15 @@ class SuperWord : public ResourceObj {
     return vloop_analyzer().memory_slices().same_memory_slice(n1, n2);
   }
 
+  // VLoopAnalyzer body
+  const GrowableArray<Node*>& body() const {
+    return vloop_analyzer().body().body();
+  }
+
+  int bb_idx(const Node* n) const     {
+    return vloop_analyzer().body().bb_idx(n);
+  }
+
 #ifndef PRODUCT
   // TraceAutoVectorization and TraceSuperWord
   bool is_trace_superword_vector_element_type() const {
@@ -317,7 +323,6 @@ class SuperWord : public ResourceObj {
   bool     do_vector_loop()        { return _do_vector_loop; }
 
   const GrowableArray<Node_List*>& packset() const { return _packset; }
-  const GrowableArray<Node*>&      block()   const { return _block; }
   const DepGraph&                  dg()      const { return _dg; }
  private:
   bool           _race_possible;   // In cases where SDMU is true
@@ -340,12 +345,6 @@ class SuperWord : public ResourceObj {
   const MemNode* align_to_ref() const { return _align_to_ref; }
   void set_align_to_ref(const MemNode* m) { _align_to_ref = m; }
 
-  // block accessors
- public:
-  int  bb_idx(const Node* n) const { assert(in_bb(n), "must be"); return _bb_idx.at(n->_idx); }
- private:
-  void set_bb_idx(Node* n, int i)  { _bb_idx.at_put_grow(n->_idx, i); }
-
   // Ensure node_info contains element "i"
   void grow_node_info(int i) { if (i >= _node_info.length()) _node_info.at_put_grow(i, SWNodeInfo::initial); }
 
@@ -453,8 +452,6 @@ class SuperWord : public ResourceObj {
   DEBUG_ONLY(void verify_no_extract();)
   // Is use->in(u_idx) a vector use?
   bool is_vector_use(Node* use, int u_idx);
-  // Construct reverse postorder list of block members
-  bool construct_bb();
   // Initialize per node info
   void initialize_node_info();
   // Compute max depth for expressions from beginning of block
@@ -482,7 +479,6 @@ class SuperWord : public ResourceObj {
   // print methods
   void print_packset();
   void print_pack(Node_List* p);
-  void print_bb();
   void print_stmt(Node* s);
 
   void packset_sort(int n);
diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
index c7576e7343dfd..5121634285b90 100644
--- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
+++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
@@ -33,6 +33,7 @@
   flags(PRECONDITIONS,        "Trace VLoop::check_preconditions") \
   flags(LOOP_ANALYZER,        "Trace VLoopAnalyzer::setup_submodules") \
   flags(MEMORY_SLICES,        "Trace VLoopMemorySlices") \
+  flags(BODY,                 "Trace VLoopBody") \
   flags(SW_TYPES,             "Trace SuperWord::compute_vector_element_type") \
   flags(SW_ALIGNMENT,         "Trace SuperWord alignment analysis") \
   flags(SW_DEPENDENCE_GRAPH,  "Trace SuperWord::dependence_graph") \
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index 56350b3a0a309..d90dea194ecf8 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -158,6 +158,11 @@ const char* VLoopAnalyzer::setup_submodules_helper() {
     return VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE;
   }
 
+  const char* body_failure = _body.construct();
+  if (body_failure != nullptr) {
+    return body_failure;
+  }
+
   // TODO
   return VLoopAnalyzer::SUCCESS;
 }
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index 1ade6011b2e1d..0ae2bb7a12da9 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -120,6 +120,10 @@ class VLoop : public StackObj {
     return vtrace().is_trace(TraceAutoVectorizationTag::MEMORY_SLICES);
   }
 
+  bool is_trace_body() const {
+    return vtrace().is_trace(TraceAutoVectorizationTag::BODY);
+  }
+
   bool is_trace_pointer_analysis() const {
     return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
   }
@@ -284,6 +288,48 @@ class VLoopMemorySlices : public StackObj {
 #endif
 };
 
+// Submodule of VLoopAnalyzer.
+// Finds all nodes in the body, and creates a mapping node->_idx to a body_idx.
+// This mapping is used so that the sizes of subsequent data structures only grow
+// with the body size, and not with the number of all nodes in the compilation.
+class VLoopBody : public StackObj {
+private:
+  static constexpr char const* FAILURE_NODE_NOT_ALLOWED = "encountered unhandled node";
+
+  const VLoop& _vloop;
+
+  // Mapping body_idx -> Node*, filled in reverse-post-order by construct()
+  GrowableArray<Node*> _body;
+
+  // Mapping node->_idx -> body_idx
+  // Can be very large, and thus lives in VSharedData
+  GrowableArray<int>&  _body_idx;
+
+  const VLoop& vloop() const { return _vloop; }
+
+public:
+  VLoopBody(Arena* arena, const VLoop& vloop, VSharedData& vshared) :
+    _vloop(vloop),
+    _body(arena, vloop.estimated_body_length(), 0, nullptr),
+    _body_idx(vshared.node_idx_to_loop_body_idx()) {}
+
+  NONCOPYABLE(VLoopBody);
+
+  const char* construct(); // Returns nullptr on success, else a failure message
+  const GrowableArray<Node*>& body() const { return _body; }
+  NOT_PRODUCT( void print() const; )
+
+  int bb_idx(const Node* n) const {
+    assert(_vloop.in_bb(n), "must be in basic block");
+    return _body_idx.at(n->_idx);
+  }
+
+private:
+  void set_bb_idx(Node* n, int i) {
+    _body_idx.at_put_grow(n->_idx, i);
+  }
+};
+
 // Analyze the loop in preparation for auto-vectorization. This class is
 // deliberately structured into many submodules, which are as independent
 // as possible, though some submodules do require other submodules.
@@ -307,7 +353,7 @@ class VLoopAnalyzer : StackObj {
   // TODO
   VLoopReductions            _reductions;
   VLoopMemorySlices    _memory_slices;
-  //VLoopBody            _body;
+  VLoopBody            _body;
   //VLoopTypes           _types;
   //VLoopDependenceGraph _dependence_graph;
 
@@ -317,8 +363,8 @@ class VLoopAnalyzer : StackObj {
     _arena(mtCompiler),
     _success(false),
     _reductions      (&_arena, vloop),
-    _memory_slices   (&_arena, vloop)
-    //_body            (&_arena, vloop),
+    _memory_slices   (&_arena, vloop),
+    _body            (&_arena, vloop, vshared)
     //_types           (&_arena, vloop, body()),
     //_dependence_graph(&_arena, vloop, memory_slices(), body())
     // TODO modules
@@ -335,7 +381,7 @@ class VLoopAnalyzer : StackObj {
   const VLoop& vloop()                           const { return _vloop; }
   const VLoopReductions& reductions()            const { return _reductions; }
   const VLoopMemorySlices& memory_slices()       const { return _memory_slices; }
-  //const VLoopBody& body()                        const { return _body; }
+  const VLoopBody& body()                        const { return _body; }
   //const VLoopTypes& types()                      const { return _types; }
   //const VLoopDependenceGraph& dependence_graph() const { return _dependence_graph; }
   // TODO

From 3cf41a5a8d4487dc1d90f9e2fc8a127f3c3cce96 Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Sat, 10 Feb 2024 22:50:04 +0100
Subject: [PATCH 05/13] VLoopTypes

---
 src/hotspot/share/opto/superword.cpp          |  77 +++++--------
 src/hotspot/share/opto/superword.hpp          |  55 +++++----
 .../share/opto/traceAutoVectorizationTag.hpp  |   3 +-
 src/hotspot/share/opto/vectorization.cpp      |   2 +
 src/hotspot/share/opto/vectorization.hpp      | 106 ++++++++++++++++--
 5 files changed, 158 insertions(+), 85 deletions(-)

diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index 15d13358c9be7..39344aee3a7b4 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -460,9 +460,6 @@ bool SuperWord::SLP_extract() {
   // compute function depth(Node*)
   compute_max_depth();
 
-  // Compute vector element types
-  compute_vector_element_type();
-
   // Attempt vectorization
   find_adjacent_refs();
 
@@ -1120,13 +1117,6 @@ void SuperWord::set_alignment(Node* s1, Node* s2, int align) {
   }
 }
 
-//------------------------------data_size---------------------------
-int SuperWord::data_size(Node* s) {
-  int bsize = type2aelembytes(velt_basic_type(s));
-  assert(bsize != 0, "valid size");
-  return bsize;
-}
-
 //------------------------------extend_packlist---------------------------
 // Extend packset by following use->def and def->use links from pack members.
 void SuperWord::extend_packlist() {
@@ -3094,30 +3084,29 @@ int SuperWord::max_vector_size_in_def_use_chain(Node* n) {
   return max < 2 ? Matcher::max_vector_size_auto_vectorization(bt) : max;
 }
 
-//-------------------------compute_vector_element_type-----------------------
-// Compute necessary vector element type for expressions
-// This propagates backwards a narrower integer type when the
-// upper bits of the value are not needed.
-// Example:  char a,b,c;  a = b + c;
-// Normally the type of the add is integer, but for packed character
-// operations the type of the add needs to be char.
-void SuperWord::compute_vector_element_type() {
+void VLoopTypes::compute_vector_element_type() {
 #ifndef PRODUCT
-  if (is_trace_superword_vector_element_type()) {
-    tty->print_cr("\ncompute_velt_type:");
+  if (vloop().is_trace_vector_element_type()) {
+    tty->print_cr("\nVLoopTypes::compute_vector_element_type:");
   }
 #endif
 
+  const GrowableArray<Node*>& body = _body.body();
+
+  assert(_velt_type.is_empty(), "must not yet be computed");
+  // reserve space
+  _velt_type.at_put_grow(body.length()-1, nullptr);
+
   // Initial type
-  for (int i = 0; i < body().length(); i++) {
-    Node* n = body().at(i);
+  for (int i = 0; i < body.length(); i++) {
+    Node* n = body.at(i);
     set_velt_type(n, container_type(n));
   }
 
   // Propagate integer narrowed type backwards through operations
   // that don't depend on higher order bits
-  for (int i = body().length() - 1; i >= 0; i--) {
-    Node* n = body().at(i);
+  for (int i = body.length() - 1; i >= 0; i--) {
+    Node* n = body.at(i);
     // Only integer types need be examined
     const Type* vtn = velt_type(n);
     if (vtn->basic_type() == T_INT) {
@@ -3127,12 +3116,14 @@ void SuperWord::compute_vector_element_type() {
       for (uint j = start; j < end; j++) {
         Node* in  = n->in(j);
         // Don't propagate through a memory
-        if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT &&
+        if (!in->is_Mem() &&
+            vloop().in_bb(in) &&
+            velt_type(in)->basic_type() == T_INT &&
             data_size(n) < data_size(in)) {
           bool same_type = true;
           for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
             Node *use = in->fast_out(k);
-            if (!in_bb(use) || !same_velt_type(use, n)) {
+            if (!vloop().in_bb(use) || !same_velt_type(use, n)) {
               same_type = false;
               break;
             }
@@ -3149,7 +3140,9 @@ void SuperWord::compute_vector_element_type() {
             int op = in->Opcode();
             if (VectorNode::is_shift_opcode(op) || op == Op_AbsI || op == Op_ReverseBytesI) {
               Node* load = in->in(1);
-              if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) {
+              if (load->is_Load() &&
+                  vloop().in_bb(load) &&
+                  (velt_type(load)->basic_type() == T_INT)) {
                 // Only Load nodes distinguish signed (LoadS/LoadB) and unsigned
                 // (LoadUS/LoadUB) values. Store nodes only have one version.
                 vt = velt_type(load);
@@ -3165,16 +3158,17 @@ void SuperWord::compute_vector_element_type() {
       }
     }
   }
-  for (int i = 0; i < body().length(); i++) {
-    Node* n = body().at(i);
+  for (int i = 0; i < body.length(); i++) {
+    Node* n = body.at(i);
     Node* nn = n;
     if (nn->is_Bool() && nn->in(0) == nullptr) {
       nn = nn->in(1);
       assert(nn->is_Cmp(), "always have Cmp above Bool");
     }
     if (nn->is_Cmp() && nn->in(0) == nullptr) {
-      assert(in_bb(nn->in(1)) || in_bb(nn->in(2)), "one of the inputs must be in the loop too");
-      if (in_bb(nn->in(1))) {
+      assert(vloop().in_bb(nn->in(1)) || vloop().in_bb(nn->in(2)),
+             "one of the inputs must be in the loop too");
+      if (vloop().in_bb(nn->in(1))) {
         set_velt_type(n, velt_type(nn->in(1)));
       } else {
         set_velt_type(n, velt_type(nn->in(2)));
@@ -3182,9 +3176,9 @@ void SuperWord::compute_vector_element_type() {
     }
   }
 #ifndef PRODUCT
-  if (is_trace_superword_vector_element_type()) {
-    for (int i = 0; i < body().length(); i++) {
-      Node* n = body().at(i);
+  if (vloop().is_trace_vector_element_type()) {
+    for (int i = 0; i < body.length(); i++) {
+      Node* n = body.at(i);
       velt_type(n)->dump();
       tty->print("\t");
       n->dump();
@@ -3223,9 +3217,8 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
   return off_mod;
 }
 
-//---------------------------container_type---------------------------
 // Smallest type containing range of values
-const Type* SuperWord::container_type(Node* n) {
+const Type* VLoopTypes::container_type(Node* n) const {
   if (n->is_Mem()) {
     BasicType bt = n->as_Mem()->memory_type();
     if (n->is_Store() && (bt == T_CHAR)) {
@@ -3242,7 +3235,7 @@ const Type* SuperWord::container_type(Node* n) {
     }
     return Type::get_const_basic_type(bt);
   }
-  const Type* t = igvn().type(n);
+  const Type* t = vloop().phase()->igvn().type(n);
   if (t->basic_type() == T_INT) {
     // A narrow type of arithmetic operations will be determined by
     // propagating the type of memory operations.
@@ -3251,16 +3244,6 @@ const Type* SuperWord::container_type(Node* n) {
   return t;
 }
 
-bool SuperWord::same_velt_type(Node* n1, Node* n2) {
-  const Type* vt1 = velt_type(n1);
-  const Type* vt2 = velt_type(n2);
-  if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
-    // Compare vectors element sizes for integer types.
-    return data_size(n1) == data_size(n2);
-  }
-  return vt1 == vt2;
-}
-
 bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const {
   return vloop().phase()->C->get_alias_index(m1->adr_type()) ==
          vloop().phase()->C->get_alias_index(m2->adr_type());
diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp
index fa8563112bba5..efa101c4d060c 100644
--- a/src/hotspot/share/opto/superword.hpp
+++ b/src/hotspot/share/opto/superword.hpp
@@ -187,10 +187,9 @@ class SWNodeInfo {
  public:
   int         _alignment; // memory alignment for a node
   int         _depth;     // Max expression (DAG) depth from block start
-  const Type* _velt_type; // vector element type
   Node_List*  _my_pack;   // pack containing this node
 
-  SWNodeInfo() : _alignment(-1), _depth(0), _velt_type(nullptr), _my_pack(nullptr) {}
+  SWNodeInfo() : _alignment(-1), _depth(0), _my_pack(nullptr) {}
   static const SWNodeInfo initial;
 };
 
@@ -250,7 +249,7 @@ class SuperWord : public ResourceObj {
     return vloop_analyzer().memory_slices().same_memory_slice(n1, n2);
   }
 
-  // VLoopAnalyzer body
+  // VLoopBody Accessors
   const GrowableArray<Node*>& body() const {
     return vloop_analyzer().body().body();
   }
@@ -259,13 +258,33 @@ class SuperWord : public ResourceObj {
     return vloop_analyzer().body().bb_idx(n);
   }
 
-#ifndef PRODUCT
-  // TraceAutoVectorization and TraceSuperWord
-  bool is_trace_superword_vector_element_type() const {
-    // Too verbose for TraceSuperWord
-    return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES);
+  // VLoopTypes Accessors
+  const Type* velt_type(Node* n) const {
+    return vloop_analyzer().types().velt_type(n);
+  }
+
+  BasicType velt_basic_type(Node* n) const {
+    return vloop_analyzer().types().velt_basic_type(n);
+  }
+
+  bool same_velt_type(Node* n1, Node* n2) const {
+    return vloop_analyzer().types().same_velt_type(n1, n2);
+  }
+
+  int data_size(Node* n) const {
+    return vloop_analyzer().types().data_size(n);
+  }
+
+  int vector_width(Node* n) const {
+    return vloop_analyzer().types().vector_width(n);
   }
 
+  int vector_width_in_bytes(const Node* n) const {
+    return vloop_analyzer().types().vector_width_in_bytes(n);
+  }
+
+#ifndef PRODUCT
+  // TraceAutoVectorization and TraceSuperWord
   bool is_trace_superword_alignment() const {
     // Too verbose for TraceSuperWord
     return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
@@ -304,7 +323,6 @@ class SuperWord : public ResourceObj {
   bool is_trace_superword_any() const {
     return TraceSuperWord ||
            is_trace_align_vector() ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES) ||
            vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
            vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
            vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
@@ -333,14 +351,6 @@ class SuperWord : public ResourceObj {
   // Accessors
   Arena* arena()                   { return &_arena; }
 
-  int vector_width(const Node* n) const {
-    BasicType bt = velt_basic_type(n);
-    return MIN2(ABS(iv_stride()), Matcher::max_vector_size(bt));
-  }
-  int vector_width_in_bytes(const Node* n) const {
-    BasicType bt = velt_basic_type(n);
-    return vector_width(n)*type2aelembytes(bt);
-  }
   int get_vw_bytes_special(MemNode* s);
   const MemNode* align_to_ref() const { return _align_to_ref; }
   void set_align_to_ref(const MemNode* m) { _align_to_ref = m; }
@@ -359,12 +369,6 @@ class SuperWord : public ResourceObj {
   int depth(Node* n) const                   { return _node_info.adr_at(bb_idx(n))->_depth; }
   void set_depth(Node* n, int d)             { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_depth = d; }
 
-  // vector element type
-  const Type* velt_type(const Node* n) const { return _node_info.adr_at(bb_idx(n))->_velt_type; }
-  BasicType velt_basic_type(const Node* n) const { return velt_type(n)->array_element_basic_type(); }
-  void set_velt_type(Node* n, const Type* t) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_velt_type = t; }
-  bool same_velt_type(Node* n1, Node* n2);
-
   // my_pack
  public:
   Node_List* my_pack(Node* n)                 { return !in_bb(n) ? nullptr : _node_info.adr_at(bb_idx(n))->_my_pack; }
@@ -405,7 +409,6 @@ class SuperWord : public ResourceObj {
   // do s1 and s2 have similar input edges?
   bool have_similar_inputs(Node* s1, Node* s2);
   void set_alignment(Node* s1, Node* s2, int align);
-  int data_size(Node* s);
   // Extend packset by following use->def and def->use links from pack members.
   void extend_packlist();
   int adjust_alignment_for_type_conversion(Node* s, Node* t, int align);
@@ -460,8 +463,6 @@ class SuperWord : public ResourceObj {
   BasicType longer_type_for_conversion(Node* n);
   // Find the longest type in def-use chain for packed nodes, and then compute the max vector size.
   int max_vector_size_in_def_use_chain(Node* n);
-  // Compute necessary vector element type for expressions
-  void compute_vector_element_type();
   // Are s1 and s2 in a pack pair and ordered as s1,s2?
   bool in_packset(Node* s1, Node* s2);
   // Remove the pack at position pos in the packset
@@ -469,8 +470,6 @@ class SuperWord : public ResourceObj {
   static LoadNode::ControlDependency control_dependency(Node_List* p);
   // Alignment within a vector memory reference
   int memory_alignment(MemNode* s, int iv_adjust);
-  // Smallest type containing range of values
-  const Type* container_type(Node* n);
   // Ensure that the main loop vectors are aligned by adjusting the pre loop limit.
   void adjust_pre_loop_limit_to_align_main_loop_vectors();
   // Is the use of d1 in u1 at the same operand position as d2 in u2?
diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
index 5121634285b90..615f9230f3ae4 100644
--- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
+++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp
@@ -34,7 +34,7 @@
   flags(LOOP_ANALYZER,        "Trace VLoopAnalyzer::setup_submodules") \
   flags(MEMORY_SLICES,        "Trace VLoopMemorySlices") \
   flags(BODY,                 "Trace VLoopBody") \
-  flags(SW_TYPES,             "Trace SuperWord::compute_vector_element_type") \
+  flags(TYPES,                "Trace VLoopTypes") \
   flags(SW_ALIGNMENT,         "Trace SuperWord alignment analysis") \
   flags(SW_DEPENDENCE_GRAPH,  "Trace SuperWord::dependence_graph") \
   flags(SW_ADJACENT_MEMOPS,   "Trace SuperWord::find_adjacent_refs") \
@@ -114,7 +114,6 @@ class TraceAutoVectorizationTagValidator {
       } else if (ALL == tag) {
         _tags.set_range(0, TRACE_AUTO_VECTORIZATION_TAG_NUM);
       } else if (SW_VERBOSE == tag) {
-        _tags.at_put(SW_TYPES, set_bit);
         _tags.at_put(SW_ALIGNMENT, set_bit);
         _tags.at_put(SW_DEPENDENCE_GRAPH, set_bit);
         _tags.at_put(SW_ADJACENT_MEMOPS, set_bit);
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index d90dea194ecf8..0fbb337f26323 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -163,6 +163,8 @@ const char* VLoopAnalyzer::setup_submodules_helper() {
     return body_failure;
   }
 
+  _types.compute_vector_element_type();
+
   // TODO
   return VLoopAnalyzer::SUCCESS;
 }
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index 0ae2bb7a12da9..39ad48ce0f4f2 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -124,6 +124,10 @@ class VLoop : public StackObj {
     return vtrace().is_trace(TraceAutoVectorizationTag::BODY);
   }
 
+  bool is_trace_vector_element_type() const {
+    return vtrace().is_trace(TraceAutoVectorizationTag::TYPES);
+  }
+
   bool is_trace_pointer_analysis() const {
     return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
   }
@@ -330,6 +334,92 @@ class VLoopBody : public StackObj {
   }
 };
 
+// Submodule of VLoopAnalyzer.
+// Compute the vector element type for every node in the loop body.
+// We need to do this to be able to vectorize the narrower integer
+// types (byte, char, short). In the C2 IR, their operations are
+// done with full int type with 4 byte precision (e.g. AddI, MulI).
+// Example:  char a,b,c;  a = (char)(b + c);
+// However, if we can prove that the upper bits are only truncated,
+// and the lower bits for the narrower type computed correctly, we
+// can compute the operations in the narrower type directly (e.g. we
+// perform the AddI or MulI with 1 or 2 bytes). This allows us to
+// fit more operations in a vector, and can remove the otherwise
+// required conversion (int <-> narrower type).
+// We compute the types backwards (use-to-def): If all use nodes
+// only require the lower bits, then the def node can do the operation
+// with only the lower bits, and we propagate the narrower type to it.
+class VLoopTypes : public StackObj {
+private:
+  const VLoop&     _vloop;
+  const VLoopBody& _body;
+
+  // bb_idx -> vector element type
+  GrowableArray<const Type*> _velt_type;
+
+  const VLoop& vloop() const    { return _vloop; }
+  const VLoopBody& body() const { return _body; }
+
+public:
+  VLoopTypes(Arena* arena,
+             const VLoop& vloop,
+             const VLoopBody& body) :
+    _vloop(vloop),
+    _body(body),
+    _velt_type(arena, vloop.estimated_body_length(), 0, nullptr) {}
+  NONCOPYABLE(VLoopTypes);
+
+  void compute_vector_element_type();
+  NOT_PRODUCT( void print() const; )
+
+  const Type* velt_type(const Node* n) const {
+    assert(vloop().in_bb(n), "only call on nodes in loop");
+    const Type* t = _velt_type.at(body().bb_idx(n));
+    assert(t != nullptr, "must have type");
+    return t;
+  }
+
+  BasicType velt_basic_type(const Node* n) const {
+    return velt_type(n)->array_element_basic_type();
+  }
+
+  int data_size(Node* s) const {
+    int bsize = type2aelembytes(velt_basic_type(s));
+    assert(bsize != 0, "valid size");
+    return bsize;
+  }
+
+  bool same_velt_type(Node* n1, Node* n2) const {
+    const Type* vt1 = velt_type(n1);
+    const Type* vt2 = velt_type(n2);
+    if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) {
+      // Compare vectors element sizes for integer types.
+      return data_size(n1) == data_size(n2);
+    }
+    return vt1 == vt2;
+  }
+
+  int vector_width(const Node* n) const {
+    BasicType bt = velt_basic_type(n);
+    return MIN2(ABS(_vloop.iv_stride()), Matcher::max_vector_size(bt));
+  }
+
+  int vector_width_in_bytes(const Node* n) const {
+    BasicType bt = velt_basic_type(n);
+    return vector_width(n) * type2aelembytes(bt);
+  }
+
+private:
+  void set_velt_type(Node* n, const Type* t) {
+    assert(t != nullptr, "cannot set nullptr");
+    assert(vloop().in_bb(n), "only call on nodes in loop");
+    _velt_type.at_put(body().bb_idx(n), t); // no grow: compute_vector_element_type() reserves the slots first
+  }
+
+  // Smallest type containing range of values
+  const Type* container_type(Node* n) const;
+};
+
 // Analyze the loop in preparation for auto-vectorization. This class is
 // deliberately structured into many submodules, which are as independent
 // as possible, though some submodules do require other submodules.
@@ -340,21 +430,21 @@ class VLoopAnalyzer : StackObj {
   static constexpr char const* FAILURE_NO_MAX_UNROLL         = "slp max unroll analysis required";
   static constexpr char const* FAILURE_NO_REDUCTION_OR_STORE = "no reduction and no store in loop";
 
-  const VLoop&               _vloop;
+  const VLoop&         _vloop;
 
   // Arena for all submodules
-  Arena                      _arena;
+  Arena                _arena;
 
   // If all submodules are setup successfully, we set this flag at the
   // end of the constructor
-  bool                       _success;
+  bool                 _success;
 
   // Submodules
   // TODO
-  VLoopReductions            _reductions;
+  VLoopReductions      _reductions;
   VLoopMemorySlices    _memory_slices;
   VLoopBody            _body;
-  //VLoopTypes           _types;
+  VLoopTypes           _types;
   //VLoopDependenceGraph _dependence_graph;
 
 public:
@@ -364,8 +454,8 @@ class VLoopAnalyzer : StackObj {
     _success(false),
     _reductions      (&_arena, vloop),
     _memory_slices   (&_arena, vloop),
-    _body            (&_arena, vloop, vshared)
-    //_types           (&_arena, vloop, body()),
+    _body            (&_arena, vloop, vshared),
+    _types           (&_arena, vloop, body())
     //_dependence_graph(&_arena, vloop, memory_slices(), body())
     // TODO modules
   {
@@ -382,7 +472,7 @@ class VLoopAnalyzer : StackObj {
   const VLoopReductions& reductions()            const { return _reductions; }
   const VLoopMemorySlices& memory_slices()       const { return _memory_slices; }
   const VLoopBody& body()                        const { return _body; }
-  //const VLoopTypes& types()                      const { return _types; }
+  const VLoopTypes& types()                      const { return _types; }
   //const VLoopDependenceGraph& dependence_graph() const { return _dependence_graph; }
   // TODO
 

From a69bacf662d4bf305e7a10f9461f0c009978a3ea Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Sat, 10 Feb 2024 23:03:54 +0100
Subject: [PATCH 06/13] remove some comments

---
 src/hotspot/share/opto/superword.cpp     | 3 ---
 src/hotspot/share/opto/vectorization.cpp | 1 -
 src/hotspot/share/opto/vectorization.hpp | 7 -------
 3 files changed, 11 deletions(-)

diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index 39344aee3a7b4..52c63dcc68943 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -376,7 +376,6 @@ void VLoopReductions::mark_reductions() {
     }
     // Reduction cycle found. Mark all nodes in the found path as reductions.
     current = first;
-    // TODO trace this
     for (int i = 0; i < path_nodes; i++) {
       _loop_reductions.set(current->_idx);
       current = original_input(current, reduction_input);
@@ -449,8 +448,6 @@ bool SuperWord::transform_loop() {
 bool SuperWord::SLP_extract() {
   assert(cl()->is_main_loop(), "SLP should only work on main loops");
 
-  // TODO remove all the VLoopAnalyzer stuff
-
   // Ensure extra info is allocated.
   initialize_node_info();
 
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index 0fbb337f26323..93bdecb02e257 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -165,7 +165,6 @@ const char* VLoopAnalyzer::setup_submodules_helper() {
 
   _types.compute_vector_element_type();
 
-  // TODO
   return VLoopAnalyzer::SUCCESS;
 }
 
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index 39ad48ce0f4f2..1d32ff694a660 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -425,7 +425,6 @@ class VLoopTypes : public StackObj {
 // as possible, though some submodules do require other submodules.
 class VLoopAnalyzer : StackObj {
 private:
-  // TODO check if all are really needed
   static constexpr char const* SUCCESS                       = "success";
   static constexpr char const* FAILURE_NO_MAX_UNROLL         = "slp max unroll analysis required";
   static constexpr char const* FAILURE_NO_REDUCTION_OR_STORE = "no reduction and no store in loop";
@@ -440,12 +439,10 @@ class VLoopAnalyzer : StackObj {
   bool                 _success;
 
   // Submodules
-  // TODO
   VLoopReductions      _reductions;
   VLoopMemorySlices    _memory_slices;
   VLoopBody            _body;
   VLoopTypes           _types;
-  //VLoopDependenceGraph _dependence_graph;
 
 public:
   VLoopAnalyzer(const VLoop& vloop, VSharedData &vshared) :
@@ -456,8 +453,6 @@ class VLoopAnalyzer : StackObj {
     _memory_slices   (&_arena, vloop),
     _body            (&_arena, vloop, vshared),
     _types           (&_arena, vloop, body())
-    //_dependence_graph(&_arena, vloop, memory_slices(), body())
-    // TODO modules
   {
     _success = setup_submodules();
   }
@@ -473,8 +468,6 @@ class VLoopAnalyzer : StackObj {
   const VLoopMemorySlices& memory_slices()       const { return _memory_slices; }
   const VLoopBody& body()                        const { return _body; }
   const VLoopTypes& types()                      const { return _types; }
-  //const VLoopDependenceGraph& dependence_graph() const { return _dependence_graph; }
-  // TODO
 
 private:
   bool setup_submodules();

From b43d513b2f0fff1ffaad4bb3718798706eb46d0d Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Sun, 11 Feb 2024 14:46:24 +0100
Subject: [PATCH 07/13] move _loop_or_ctrl from ResourceArea, because of
 ResourceMark in SuperWord::dependence_graph

---
 src/hotspot/share/opto/loopnode.hpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp
index 3b281e0f77de4..b1a0d95ddf266 100644
--- a/src/hotspot/share/opto/loopnode.hpp
+++ b/src/hotspot/share/opto/loopnode.hpp
@@ -1098,6 +1098,7 @@ class PhaseIdealLoop : public PhaseTransform {
   // Compute the Ideal Node to Loop mapping
   PhaseIdealLoop(PhaseIterGVN& igvn, LoopOptsMode mode) :
     PhaseTransform(Ideal_Loop),
+    _loop_or_ctrl(igvn.C->comp_arena()),
     _igvn(igvn),
     _verify_me(nullptr),
     _verify_only(false),
@@ -1112,6 +1113,7 @@ class PhaseIdealLoop : public PhaseTransform {
   // or only verify that the graph is valid if verify_me is null.
   PhaseIdealLoop(PhaseIterGVN& igvn, const PhaseIdealLoop* verify_me = nullptr) :
     PhaseTransform(Ideal_Loop),
+    _loop_or_ctrl(igvn.C->comp_arena()),
     _igvn(igvn),
     _verify_me(verify_me),
     _verify_only(verify_me == nullptr),

From 19acaef2ccd6ea7fd1eb66abeea466dc00c76228 Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Wed, 14 Feb 2024 17:00:29 +0100
Subject: [PATCH 08/13] VStatus: replace const char* return states with a status object

---
 src/hotspot/share/opto/superword.cpp     |  6 +-
 src/hotspot/share/opto/vectorization.cpp | 72 +++++++++++-------------
 src/hotspot/share/opto/vectorization.hpp | 30 ++++++++--
 3 files changed, 62 insertions(+), 46 deletions(-)

diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index 04b9c910635a5..b0619af928ff6 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -2929,7 +2929,7 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) {
 }
 
 // Return nullptr if success, else failure message
-const char* VLoopBody::construct() {
+VStatus VLoopBody::construct() {
   assert(_body.length() == 0, "body is empty");
 
   // First pass over loop body:
@@ -2953,7 +2953,7 @@ const char* VLoopBody::construct() {
           n->dump();
         }
 #endif
-        return VLoopBody::FAILURE_NODE_NOT_ALLOWED;
+        return VStatus::make_failure(VLoopBody::FAILURE_NODE_NOT_ALLOWED);
       }
 
 #ifdef ASSERT
@@ -3025,7 +3025,7 @@ const char* VLoopBody::construct() {
 #endif
 
   assert(rpo_idx == -1 && body_count == _body.length(), "all body members found");
-  return nullptr; // success
+  return VStatus::make_success();
 }
 
 // Initialize per node info
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index 93bdecb02e257..5f268bced090f 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -40,40 +40,38 @@ bool VLoop::check_preconditions() {
   }
 #endif
 
-  const char* return_state = check_preconditions_helper();
-  assert(return_state != nullptr, "must have return state");
-  if (return_state == VLoop::SUCCESS) {
-    return true; // success
-  }
-
+  VStatus status = check_preconditions_helper();
+  if (!status.is_success()) {
 #ifndef PRODUCT
-  if (is_trace_preconditions()) {
-    tty->print_cr("VLoop::check_preconditions: failed: %s", return_state);
-  }
+    if (is_trace_preconditions()) {
+      tty->print_cr("VLoop::check_preconditions: failed: %s", status.failure_reason());
+    }
 #endif
-  return false; // failure
+    return false; // failure
+  }
+  return true; // success
 }
 
-const char* VLoop::check_preconditions_helper() {
+VStatus VLoop::check_preconditions_helper() {
   // Only accept vector width that is power of 2
   int vector_width = Matcher::vector_width_in_bytes(T_BYTE);
   if (vector_width < 2 || !is_power_of_2(vector_width)) {
-    return VLoop::FAILURE_VECTOR_WIDTH;
+    return VStatus::make_failure(VLoop::FAILURE_VECTOR_WIDTH);
   }
 
   // Only accept valid counted loops (int)
   if (!_lpt->_head->as_Loop()->is_valid_counted_loop(T_INT)) {
-    return VLoop::FAILURE_VALID_COUNTED_LOOP;
+    return VStatus::make_failure(VLoop::FAILURE_VALID_COUNTED_LOOP);
   }
   _cl = _lpt->_head->as_CountedLoop();
   _iv = _cl->phi()->as_Phi();
 
   if (_cl->is_vectorized_loop()) {
-    return VLoop::FAILURE_ALREADY_VECTORIZED;
+    return VStatus::make_failure(VLoop::FAILURE_ALREADY_VECTORIZED);
   }
 
   if (_cl->is_unroll_only()) {
-    return VLoop::FAILURE_UNROLL_ONLY;
+    return VStatus::make_failure(VLoop::FAILURE_UNROLL_ONLY);
   }
 
   // Check for control flow in the body
@@ -89,12 +87,12 @@ const char* VLoop::check_preconditions_helper() {
       _lpt->dump_head();
     }
 #endif
-    return VLoop::FAILURE_CONTROL_FLOW;
+    return VStatus::make_failure(VLoop::FAILURE_CONTROL_FLOW);
   }
 
   // Make sure the are no extra control users of the loop backedge
   if (_cl->back_control()->outcnt() != 1) {
-    return VLoop::FAILURE_BACKEDGE;
+    return VStatus::make_failure(VLoop::FAILURE_BACKEDGE);
   }
 
   // To align vector memory accesses in the main-loop, we will have to adjust
@@ -102,16 +100,16 @@ const char* VLoop::check_preconditions_helper() {
   if (_cl->is_main_loop()) {
     CountedLoopEndNode* pre_end = _cl->find_pre_loop_end();
     if (pre_end == nullptr) {
-      return VLoop::FAILURE_PRE_LOOP_LIMIT;
+      return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
     }
     Node* pre_opaq1 = pre_end->limit();
     if (pre_opaq1->Opcode() != Op_Opaque1) {
-      return VLoop::FAILURE_PRE_LOOP_LIMIT;
+      return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT);
     }
     _pre_loop_end = pre_end;
   }
 
-  return VLoop::SUCCESS;
+  return VStatus::make_success();
 }
 
 // Return true iff all submodules are loaded successfully
@@ -124,24 +122,22 @@ bool VLoopAnalyzer::setup_submodules() {
   }
 #endif
 
-  const char* state = setup_submodules_helper();
-    if (state == VLoopAnalyzer::SUCCESS) {
-    return true; // success
-  }
-
+  VStatus status = setup_submodules_helper();
+  if (!status.is_success()) {
 #ifndef PRODUCT
-  if (vloop().is_trace_loop_analyzer()) {
-    tty->print_cr("\nVLoopAnalyze::setup_submodules: failed: %s", state);
-  }
+    if (vloop().is_trace_loop_analyzer()) {
+      tty->print_cr("\nVLoopAnalyze::setup_submodules: failed: %s", status.failure_reason());
+    }
 #endif
-  return false; // failed
+    return false; // failed
+  }
+  return true; // success
 }
 
-// Return SUCCESS string iff all submodules are setup successfully
-const char* VLoopAnalyzer::setup_submodules_helper() {
+VStatus VLoopAnalyzer::setup_submodules_helper() {
   // Skip any loop that has not been assigned max unroll by analysis.
   if (SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0) {
-    return VLoopAnalyzer::FAILURE_NO_MAX_UNROLL;
+    return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_MAX_UNROLL);
   }
 
   if (SuperWordReductions) {
@@ -150,22 +146,22 @@ const char* VLoopAnalyzer::setup_submodules_helper() {
 
   _memory_slices.find_memory_slices();
 
-  // If there is no memory slice detected, that means there is no store.
+  // If there is no memory slice detected, it means there is no store.
   // If there is no reduction and no store, then we give up, because
   // vectorization is not possible anyway (given current limitations).
   if (!reductions().is_marked_reduction_loop() &&
       _memory_slices.heads().is_empty()) {
-    return VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE;
+    return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE);
   }
 
-  const char* body_failure = _body.construct();
-  if (body_failure != nullptr) {
-    return body_failure;
+  VStatus body_status = _body.construct();
+  if (!body_status.is_success()) {
+    return body_status;
   }
 
   _types.compute_vector_element_type();
 
-  return VLoopAnalyzer::SUCCESS;
+  return VStatus::make_success();
 }
 
 #ifndef PRODUCT
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index 1d32ff694a660..7b2f3e340db40 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -33,6 +33,28 @@
 // Code in this file and the vectorization.cpp contains shared logics and
 // utilities for C2's loop auto-vectorization.
 
+class VStatus : public StackObj {
+private:
+  const char* _failure_reason;
+
+  VStatus(const char* failure_reason) : _failure_reason(failure_reason) {}
+
+public:
+  static VStatus make_success() { return VStatus(nullptr); }
+
+  static VStatus make_failure(const char* failure_reason) {
+    assert(failure_reason != nullptr, "must have reason");
+    return VStatus(failure_reason);
+  }
+
+  bool is_success() const { return _failure_reason == nullptr; }
+
+  const char* failure_reason() const {
+    assert(!is_success(), "only failures have reason");
+    return _failure_reason;
+  }
+};
+
 #ifndef PRODUCT
 // Access to TraceAutoVectorization tags
 class VTrace : public StackObj {
@@ -62,7 +84,6 @@ class VLoop : public StackObj {
 
   NOT_PRODUCT(VTrace _vtrace;)
 
-  static constexpr char const* SUCCESS                    = "success";
   static constexpr char const* FAILURE_ALREADY_VECTORIZED = "loop already vectorized";
   static constexpr char const* FAILURE_UNROLL_ONLY        = "loop only wants to be unrolled";
   static constexpr char const* FAILURE_VECTOR_WIDTH       = "vector_width must be power of 2";
@@ -145,7 +166,7 @@ class VLoop : public StackObj {
   bool check_preconditions();
 
 private:
-  const char* check_preconditions_helper();
+  VStatus check_preconditions_helper();
 };
 
 // Optimization to keep allocation of large arrays in AutoVectorization low.
@@ -319,7 +340,7 @@ class VLoopBody : public StackObj {
 
   NONCOPYABLE(VLoopBody);
 
-  const char* construct();
+  VStatus construct();
   const GrowableArray<Node*>& body() const { return _body; }
   NOT_PRODUCT( void print() const; )
 
@@ -425,7 +446,6 @@ class VLoopTypes : public StackObj {
 // as possible, though some submodules do require other submodules.
 class VLoopAnalyzer : StackObj {
 private:
-  static constexpr char const* SUCCESS                       = "success";
   static constexpr char const* FAILURE_NO_MAX_UNROLL         = "slp max unroll analysis required";
   static constexpr char const* FAILURE_NO_REDUCTION_OR_STORE = "no reduction and no store in loop";
 
@@ -471,7 +491,7 @@ class VLoopAnalyzer : StackObj {
 
 private:
   bool setup_submodules();
-  const char* setup_submodules_helper();
+  VStatus setup_submodules_helper();
 };
 
 // A vectorization pointer (VPointer) has information about an address for

From 6c28172c8f4c7440e1562aaa1974446e934245a3 Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Wed, 14 Feb 2024 17:07:09 +0100
Subject: [PATCH 09/13] Apply suggestions from code review

thanks Christian

Co-authored-by: Christian Hagedorn <christian.hagedorn@oracle.com>
---
 src/hotspot/share/opto/superword.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index b0619af928ff6..becf53b8ac402 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -758,14 +758,14 @@ void SuperWord::dependence_graph() {
 
   // First, assign a dependence node to each memory node
   for (int i = 0; i < body().length(); i++ ) {
-    Node *n = body().at(i);
+    Node* n = body().at(i);
     if (n->is_Mem() || n->is_memory_phi()) {
       _dg.make_node(n);
     }
   }
 
-  const GrowableArray<PhiNode*> &mem_slice_head = vloop_analyzer().memory_slices().heads();
-  const GrowableArray<MemNode*> &mem_slice_tail = vloop_analyzer().memory_slices().tails();
+  const GrowableArray<PhiNode*>& mem_slice_head = vloop_analyzer().memory_slices().heads();
+  const GrowableArray<MemNode*>& mem_slice_tail = vloop_analyzer().memory_slices().tails();
 
   ResourceMark rm;
   GrowableArray<Node*> slice_nodes;

From ba07a799a626b3ef8e50de7d06999c9acd686041 Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Wed, 14 Feb 2024 17:12:34 +0100
Subject: [PATCH 10/13] more for Christian

---
 src/hotspot/share/opto/vectorization.hpp | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index 7b2f3e340db40..827a9a3c26df5 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -258,23 +258,29 @@ class VLoopReductions : public StackObj {
   // that it assumes counted loops and requires that reduction nodes are not
   // used within the loop except by their reduction cycle predecessors.
   void mark_reductions();
+
   // Whether n is a reduction operator and part of a reduction cycle.
   // This function can be used for individual queries outside auto-vectorization,
   // e.g. to inform matching in target-specific code. Otherwise, the
   // almost-equivalent but faster mark_reductions() is preferable.
   static bool is_reduction(const Node* n);
+
   // Whether n is marked as a reduction node.
   bool is_marked_reduction(const Node* n) const { return _loop_reductions.test(n->_idx); }
+
   bool is_marked_reduction_loop() const { return !_loop_reductions.is_empty(); }
+
   // Are s1 and s2 reductions with a data path between them?
   bool is_marked_reduction_pair(Node* s1, Node* s2) const;
 
 private:
   // Whether n is a standard reduction operator.
   static bool is_reduction_operator(const Node* n);
+
   // Whether n is part of a reduction cycle via the 'input' edge index. To bound
   // the search, constrain the size of reduction cycles to LoopMaxUnroll.
   static bool in_reduction_cycle(const Node* n, uint input);
+
   // Reference to the i'th input node of n, commuting the inputs of binary nodes
   // whose edges have been swapped. Assumes n is a commutative operation.
   static Node* original_input(const Node* n, uint i);
@@ -480,8 +486,6 @@ class VLoopAnalyzer : StackObj {
 
   bool success() const { return _success; }
 
-  Arena* arena()       { return &_arena; }
-
   // Read-only accessors for submodules
   const VLoop& vloop()                           const { return _vloop; }
   const VLoopReductions& reductions()            const { return _reductions; }

From 1d771fdda4f3d6bbce009939762db883c00986e4 Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Thu, 15 Feb 2024 19:26:49 +0100
Subject: [PATCH 11/13] remove accessors, use fields instead

---
 src/hotspot/share/opto/superword.cpp     | 113 ++++++++++++-----------
 src/hotspot/share/opto/superword.hpp     |  69 +++++++-------
 src/hotspot/share/opto/vectorization.cpp |  12 +--
 src/hotspot/share/opto/vectorization.hpp |  39 +++-----
 4 files changed, 111 insertions(+), 122 deletions(-)

diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index becf53b8ac402..e4cce5f67fa9e 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -40,9 +40,10 @@
 
 SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) :
   _vloop_analyzer(vloop_analyzer),
+  _vloop(vloop_analyzer.vloop()),
   _arena(mtCompiler),
   _packset(arena(), 8,  0, nullptr),                        // packs for the current block
-  _node_info(arena(), vloop().estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
+  _node_info(arena(), _vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node
   _clone_map(phase()->C->clone_map()),                      // map of nodes created in cloning
   _align_to_ref(nullptr),                                   // memory reference to align vectors to
   _dg(arena()),                                             // dependence graph
@@ -299,7 +300,7 @@ Node* VLoopReductions::original_input(const Node* n, uint i) {
 
 void VLoopReductions::mark_reductions() {
   assert(_loop_reductions.is_empty(), "must not yet be computed");
-  CountedLoopNode* cl = vloop().cl();
+  CountedLoopNode* cl = _vloop.cl();
 
   // Iterate through all phi nodes associated to the loop and search for
   // reduction cycles in the basic block.
@@ -311,7 +312,7 @@ void VLoopReductions::mark_reductions() {
     if (phi->outcnt() == 0) {
       continue;
     }
-    if (phi == vloop().iv()) {
+    if (phi == _vloop.iv()) {
       continue;
     }
     // The phi's loop-back is considered the first node in the reduction cycle.
@@ -335,9 +336,9 @@ void VLoopReductions::mark_reductions() {
       // to the phi node following edge index 'input'.
       PathEnd path =
         find_in_path(
-          first, input, vloop().lpt()->_body.size(),
+          first, input, _vloop.lpt()->_body.size(),
           [&](const Node* n) { return n->Opcode() == first->Opcode() &&
-                                      vloop().in_bb(n); },
+                                      _vloop.in_bb(n); },
           [&](const Node* n) { return n == phi; });
       if (path.first != nullptr) {
         reduction_input = input;
@@ -356,7 +357,7 @@ void VLoopReductions::mark_reductions() {
     for (int i = 0; i < path_nodes; i++) {
       for (DUIterator_Fast jmax, j = current->fast_outs(jmax); j < jmax; j++) {
         Node* u = current->fast_out(j);
-        if (!vloop().in_bb(u)) {
+        if (!_vloop.in_bb(u)) {
           continue;
         }
         if (u == succ) {
@@ -533,13 +534,13 @@ void SuperWord::find_adjacent_refs() {
       set_align_to_ref(align_to_mem_ref);
     }
 
-    VPointer align_to_ref_p(mem_ref, vloop());
+    VPointer align_to_ref_p(mem_ref, _vloop);
     // Set alignment relative to "align_to_ref" for all related memory operations.
     for (int i = memops.size() - 1; i >= 0; i--) {
       MemNode* s = memops.at(i)->as_Mem();
       if (isomorphic(s, mem_ref) &&
            (!_do_vector_loop || same_origin_idx(s, mem_ref))) {
-        VPointer p2(s, vloop());
+        VPointer p2(s, _vloop);
         if (p2.comparable(align_to_ref_p)) {
           int align = memory_alignment(s, iv_adjustment);
           set_alignment(s, align);
@@ -598,11 +599,11 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
   // Count number of comparable memory ops
   for (uint i = 0; i < memops.size(); i++) {
     MemNode* s1 = memops.at(i)->as_Mem();
-    VPointer p1(s1, vloop());
+    VPointer p1(s1, _vloop);
     for (uint j = i+1; j < memops.size(); j++) {
       MemNode* s2 = memops.at(j)->as_Mem();
       if (isomorphic(s1, s2)) {
-        VPointer p2(s2, vloop());
+        VPointer p2(s2, _vloop);
         if (p1.comparable(p2)) {
           (*cmp_ct.adr_at(i))++;
           (*cmp_ct.adr_at(j))++;
@@ -623,7 +624,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
     if (s->is_Store()) {
       int vw = vector_width_in_bytes(s);
       assert(vw > 1, "sanity");
-      VPointer p(s, vloop());
+      VPointer p(s, _vloop);
       if ( cmp_ct.at(j) >  max_ct ||
           (cmp_ct.at(j) == max_ct &&
             ( vw >  max_vw ||
@@ -646,7 +647,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) {
       if (s->is_Load()) {
         int vw = vector_width_in_bytes(s);
         assert(vw > 1, "sanity");
-        VPointer p(s, vloop());
+        VPointer p(s, _vloop);
         if ( cmp_ct.at(j) >  max_ct ||
             (cmp_ct.at(j) == max_ct &&
               ( vw >  max_vw ||
@@ -719,7 +720,7 @@ int SuperWord::get_vw_bytes_special(MemNode* s) {
 //---------------------------get_iv_adjustment---------------------------
 // Calculate loop's iv adjustment for this memory ops.
 int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
-  VPointer align_to_ref_p(mem_ref, vloop());
+  VPointer align_to_ref_p(mem_ref, _vloop);
   int offset = align_to_ref_p.offset_in_bytes();
   int scale  = align_to_ref_p.scale_in_bytes();
   int elt_size = align_to_ref_p.memory_size();
@@ -764,8 +765,8 @@ void SuperWord::dependence_graph() {
     }
   }
 
-  const GrowableArray<PhiNode*>& mem_slice_head = vloop_analyzer().memory_slices().heads();
-  const GrowableArray<MemNode*>& mem_slice_tail = vloop_analyzer().memory_slices().tails();
+  const GrowableArray<PhiNode*>& mem_slice_head = _vloop_analyzer.memory_slices().heads();
+  const GrowableArray<MemNode*>& mem_slice_tail = _vloop_analyzer.memory_slices().tails();
 
   ResourceMark rm;
   GrowableArray<Node*> slice_nodes;
@@ -776,7 +777,7 @@ void SuperWord::dependence_graph() {
     MemNode* tail = mem_slice_tail.at(i);
 
     // Get slice in predecessor order (last is first)
-    vloop_analyzer().memory_slices().get_slice(head, tail, slice_nodes);
+    _vloop_analyzer.memory_slices().get_slice(head, tail, slice_nodes);
 
     // Make the slice dependent on the root
     DepMem* slice = _dg.dep(head);
@@ -794,13 +795,13 @@ void SuperWord::dependence_graph() {
       if (_dg.dep(s1)->in_cnt() == 0) {
         _dg.make_edge(slice, s1);
       }
-      VPointer p1(s1->as_Mem(), vloop());
+      VPointer p1(s1->as_Mem(), _vloop);
       bool sink_dependent = true;
       for (int k = j - 1; k >= 0; k--) {
         Node* s2 = slice_nodes.at(k);
         if (s1->is_Load() && s2->is_Load())
           continue;
-        VPointer p2(s2->as_Mem(), vloop());
+        VPointer p2(s2->as_Mem(), _vloop);
 
         int cmp = p1.cmp(p2);
         if (!VPointer::not_equal(cmp)) {
@@ -831,12 +832,12 @@ void SuperWord::dependence_graph() {
 void VLoopMemorySlices::find_memory_slices() {
   assert(_heads.is_empty(), "not yet computed");
   assert(_tails.is_empty(), "not yet computed");
-  CountedLoopNode* cl = vloop().cl();
+  CountedLoopNode* cl = _vloop.cl();
 
   // Iterate over all memory phis
   for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) {
     PhiNode* phi = cl->fast_out(i)->isa_Phi();
-    if (phi != nullptr && vloop().in_bb(phi) && phi->is_memory_phi()) {
+    if (phi != nullptr && _vloop.in_bb(phi) && phi->is_memory_phi()) {
       Node* phi_tail = phi->in(LoopNode::LoopBackControl);
       if (phi_tail != phi->in(LoopNode::EntryControl)) {
         _heads.push(phi);
@@ -845,7 +846,7 @@ void VLoopMemorySlices::find_memory_slices() {
     }
   }
 
-  NOT_PRODUCT( if (vloop().is_trace_memory_slices()) { print(); } )
+  NOT_PRODUCT( if (_vloop.is_trace_memory_slices()) { print(); } )
 }
 
 #ifndef PRODUCT
@@ -865,19 +866,19 @@ void VLoopMemorySlices::get_slice(PhiNode* head, MemNode* tail, GrowableArray<No
   Node* n = tail;
   Node* prev = nullptr;
   while (true) {
-    assert(vloop().in_bb(n), "must be in block");
+    assert(_vloop.in_bb(n), "must be in block");
     for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
       Node* out = n->fast_out(i);
       if (out->is_Load()) {
-        if (vloop().in_bb(out)) {
+        if (_vloop.in_bb(out)) {
           slice.push(out);
         }
       } else {
         // FIXME
-        if (out->is_MergeMem() && !vloop().in_bb(out)) {
+        if (out->is_MergeMem() && !_vloop.in_bb(out)) {
           // Either unrolling is causing a memory edge not to disappear,
           // or need to run igvn.optimize() again before SLP
-        } else if (out->is_memory_phi() && !vloop().in_bb(out)) {
+        } else if (out->is_memory_phi() && !_vloop.in_bb(out)) {
           // Ditto.  Not sure what else to check further.
         } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) {
           // StoreCM has an input edge used as a precedence edge.
@@ -895,7 +896,7 @@ void VLoopMemorySlices::get_slice(PhiNode* head, MemNode* tail, GrowableArray<No
   }
 
 #ifndef PRODUCT
-  if (vloop().is_trace_memory_slices()) {
+  if (_vloop.is_trace_memory_slices()) {
     tty->print_cr("\nVLoopMemorySlices::get_slice:");
     head->dump();
     for (int j = slice.length() - 1; j >= 0 ; j--) {
@@ -970,8 +971,8 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) {
 
   // Adjacent memory references must have the same base, be comparable
   // and have the correct distance between them.
-  VPointer p1(s1->as_Mem(), vloop());
-  VPointer p2(s2->as_Mem(), vloop());
+  VPointer p1(s1->as_Mem(), _vloop);
+  VPointer p2(s2->as_Mem(), _vloop);
   if (p1.base() != p2.base() || !p1.comparable(p2)) return false;
   int diff = p2.offset_in_bytes() - p1.offset_in_bytes();
   return diff == data_size(s1);
@@ -1602,8 +1603,8 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac
   assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs");
 
   const MemNode* mem_ref = pack->at(0)->as_Mem();
-  VPointer mem_ref_p(mem_ref, vloop());
-  const CountedLoopEndNode* pre_end = vloop().pre_loop_end();
+  VPointer mem_ref_p(mem_ref, _vloop);
+  const CountedLoopEndNode* pre_end = _vloop.pre_loop_end();
   assert(pre_end->stride_is_con(), "pre loop stride is constant");
 
   AlignmentSolver solver(pack->at(0)->as_Mem(),
@@ -2273,7 +2274,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
   // loop we may have a different last store, and we need to adjust the uses accordingly.
   GrowableArray<Node*> old_last_store_in_slice(max_slices, max_slices, nullptr);
 
-  const GrowableArray<PhiNode*> &mem_slice_head = vloop_analyzer().memory_slices().heads();
+  const GrowableArray<PhiNode*> &mem_slice_head = _vloop_analyzer.memory_slices().heads();
 
   // (1) Set up the initial memory state from Phi. And find the old last store.
   for (int i = 0; i < mem_slice_head.length(); i++) {
@@ -2394,7 +2395,7 @@ bool SuperWord::output() {
         // Walk up the memory chain, and ignore any StoreVector that provably
         // does not have any memory dependency.
         while (mem->is_StoreVector()) {
-          VPointer p_store(mem->as_Mem(), vloop());
+          VPointer p_store(mem->as_Mem(), _vloop);
           if (p_store.overlap_possible_with_any_in(p)) {
             break;
           } else {
@@ -2937,10 +2938,10 @@ VStatus VLoopBody::construct() {
   //  (2) Count number of nodes, and create a temporary map (_idx -> bb_idx).
   //  (3) Verify that all non-ctrl nodes have an input inside the loop.
   int body_count = 0;
-  for (uint i = 0; i < vloop().lpt()->_body.size(); i++) {
-    Node* n = vloop().lpt()->_body.at(i);
+  for (uint i = 0; i < _vloop.lpt()->_body.size(); i++) {
+    Node* n = _vloop.lpt()->_body.at(i);
     set_bb_idx(n, i); // Create a temporary map
-    if (vloop().in_bb(n)) {
+    if (_vloop.in_bb(n)) {
       body_count++;
 
       if (n->is_LoadStore() || n->is_MergeMem() ||
@@ -2948,7 +2949,7 @@ VStatus VLoopBody::construct() {
         // Bailout if the loop has LoadStore, MergeMem or data Proj
         // nodes. Superword optimization does not work with them.
 #ifndef PRODUCT
-        if (vloop().is_trace_body()) {
+        if (_vloop.is_trace_body()) {
           tty->print_cr("VLoopBody::construct: fails because of unhandled node:");
           n->dump();
         }
@@ -2961,7 +2962,7 @@ VStatus VLoopBody::construct() {
         bool found = false;
         for (uint j = 0; j < n->req(); j++) {
           Node* def = n->in(j);
-          if (def != nullptr && vloop().in_bb(def)) {
+          if (def != nullptr && _vloop.in_bb(def)) {
             found = true;
             break;
           }
@@ -2978,8 +2979,8 @@ VStatus VLoopBody::construct() {
   VectorSet visited;
   VectorSet post_visited;
 
-  visited.set(bb_idx(vloop().cl()));
-  stack.push(vloop().cl());
+  visited.set(bb_idx(_vloop.cl()));
+  stack.push(_vloop.cl());
 
   // Do a depth first walk over out edges
   int rpo_idx = body_count - 1;
@@ -2992,9 +2993,9 @@ VStatus VLoopBody::construct() {
       const int old_length = stack.length();
       for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) {
         Node* use = n->fast_out(i);
-        if (vloop().in_bb(use) && !visited.test(bb_idx(use)) &&
+        if (_vloop.in_bb(use) && !visited.test(bb_idx(use)) &&
             // Don't go around backedge
-            (!use->is_Phi() || n == vloop().cl())) {
+            (!use->is_Phi() || n == _vloop.cl())) {
           stack.push(use);
         }
       }
@@ -3019,7 +3020,7 @@ VStatus VLoopBody::construct() {
   }
 
 #ifndef PRODUCT
-  if (vloop().is_trace_body()) {
+  if (_vloop.is_trace_body()) {
     print();
   }
 #endif
@@ -3120,7 +3121,7 @@ int SuperWord::max_vector_size_in_def_use_chain(Node* n) {
 
 void VLoopTypes::compute_vector_element_type() {
 #ifndef PRODUCT
-  if (vloop().is_trace_vector_element_type()) {
+  if (_vloop.is_trace_vector_element_type()) {
     tty->print_cr("\nVLoopTypes::compute_vector_element_type:");
   }
 #endif
@@ -3151,13 +3152,13 @@ void VLoopTypes::compute_vector_element_type() {
         Node* in  = n->in(j);
         // Don't propagate through a memory
         if (!in->is_Mem() &&
-            vloop().in_bb(in) &&
+            _vloop.in_bb(in) &&
             velt_type(in)->basic_type() == T_INT &&
             data_size(n) < data_size(in)) {
           bool same_type = true;
           for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) {
             Node *use = in->fast_out(k);
-            if (!vloop().in_bb(use) || !same_velt_type(use, n)) {
+            if (!_vloop.in_bb(use) || !same_velt_type(use, n)) {
               same_type = false;
               break;
             }
@@ -3175,7 +3176,7 @@ void VLoopTypes::compute_vector_element_type() {
             if (VectorNode::is_shift_opcode(op) || op == Op_AbsI || op == Op_ReverseBytesI) {
               Node* load = in->in(1);
               if (load->is_Load() &&
-                  vloop().in_bb(load) &&
+                  _vloop.in_bb(load) &&
                   (velt_type(load)->basic_type() == T_INT)) {
                 // Only Load nodes distinguish signed (LoadS/LoadB) and unsigned
                 // (LoadUS/LoadUB) values. Store nodes only have one version.
@@ -3200,9 +3201,9 @@ void VLoopTypes::compute_vector_element_type() {
       assert(nn->is_Cmp(), "always have Cmp above Bool");
     }
     if (nn->is_Cmp() && nn->in(0) == nullptr) {
-      assert(vloop().in_bb(nn->in(1)) || vloop().in_bb(nn->in(2)),
+      assert(_vloop.in_bb(nn->in(1)) || _vloop.in_bb(nn->in(2)),
              "one of the inputs must be in the loop too");
-      if (vloop().in_bb(nn->in(1))) {
+      if (_vloop.in_bb(nn->in(1))) {
         set_velt_type(n, velt_type(nn->in(1)));
       } else {
         set_velt_type(n, velt_type(nn->in(2)));
@@ -3210,7 +3211,7 @@ void VLoopTypes::compute_vector_element_type() {
     }
   }
 #ifndef PRODUCT
-  if (vloop().is_trace_vector_element_type()) {
+  if (_vloop.is_trace_vector_element_type()) {
     for (int i = 0; i < body.length(); i++) {
       Node* n = body.at(i);
       velt_type(n)->dump();
@@ -3229,7 +3230,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) {
     tty->print("SuperWord::memory_alignment within a vector memory reference for %d:  ", s->_idx); s->dump();
   }
 #endif
-  VPointer p(s, vloop());
+  VPointer p(s, _vloop);
   if (!p.valid()) {
     NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: VPointer p invalid, return bottom_align");)
     return bottom_align;
@@ -3269,7 +3270,7 @@ const Type* VLoopTypes::container_type(Node* n) const {
     }
     return Type::get_const_basic_type(bt);
   }
-  const Type* t = vloop().phase()->igvn().type(n);
+  const Type* t = _vloop.phase()->igvn().type(n);
   if (t->basic_type() == T_INT) {
     // A narrow type of arithmetic operations will be determined by
     // propagating the type of memory operations.
@@ -3279,8 +3280,8 @@ const Type* VLoopTypes::container_type(Node* n) const {
 }
 
 bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const {
-  return vloop().phase()->C->get_alias_index(m1->adr_type()) ==
-         vloop().phase()->C->get_alias_index(m2->adr_type());
+  return _vloop.phase()->C->get_alias_index(m1->adr_type()) ==
+         _vloop.phase()->C->get_alias_index(m2->adr_type());
 }
 
 //------------------------------in_packset---------------------------
@@ -3363,19 +3364,19 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() {
   assert(cl()->is_main_loop(), "can only do alignment for main loop");
 
   // The opaque node for the limit, where we adjust the input
-  Opaque1Node* pre_opaq = vloop().pre_loop_end()->limit()->as_Opaque1();
+  Opaque1Node* pre_opaq = _vloop.pre_loop_end()->limit()->as_Opaque1();
 
   // Current pre-loop limit.
   Node* old_limit = pre_opaq->in(1);
 
   // Where we put new limit calculations.
-  Node* pre_ctrl = vloop().pre_loop_head()->in(LoopNode::EntryControl);
+  Node* pre_ctrl = _vloop.pre_loop_head()->in(LoopNode::EntryControl);
 
   // Ensure the original loop limit is available from the pre-loop Opaque1 node.
   Node* orig_limit = pre_opaq->original_loop_limit();
   assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, "");
 
-  VPointer align_to_ref_p(align_to_ref, vloop());
+  VPointer align_to_ref_p(align_to_ref, _vloop);
   assert(align_to_ref_p.valid(), "sanity");
 
   // For the main-loop, we want the address of align_to_ref to be memory aligned
diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp
index af9fae308438d..ca3c4284f9ee6 100644
--- a/src/hotspot/share/opto/superword.hpp
+++ b/src/hotspot/share/opto/superword.hpp
@@ -198,6 +198,7 @@ class SWNodeInfo {
 class SuperWord : public ResourceObj {
  private:
   const VLoopAnalyzer& _vloop_analyzer;
+  const VLoop&         _vloop;
 
   // Arena for small data structures. Large data structures are allocated in
   // VSharedData, and reused over many AutoVectorizations.
@@ -222,118 +223,114 @@ class SuperWord : public ResourceObj {
   // Decide if loop can eventually be vectorized, and what unrolling factor is required.
   static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor);
 
-  // VLoopAnalyzer Accessors
-  const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; }
-
   // VLoop Accessors
-  const VLoop& vloop()                  const { return vloop_analyzer().vloop(); }
-  PhaseIdealLoop* phase()               const { return vloop().phase(); }
-  PhaseIterGVN& igvn()                  const { return vloop().phase()->igvn(); }
-  IdealLoopTree* lpt()                  const { return vloop().lpt(); }
-  CountedLoopNode* cl()                 const { return vloop().cl(); }
-  PhiNode* iv()                         const { return vloop().iv(); }
+  PhaseIdealLoop* phase()               const { return _vloop.phase(); }
+  PhaseIterGVN& igvn()                  const { return _vloop.phase()->igvn(); }
+  IdealLoopTree* lpt()                  const { return _vloop.lpt(); }
+  CountedLoopNode* cl()                 const { return _vloop.cl(); }
+  PhiNode* iv()                         const { return _vloop.iv(); }
   int iv_stride()                       const { return cl()->stride_con(); }
-  bool in_bb(const Node* n)             const { return vloop().in_bb(n); }
+  bool in_bb(const Node* n)             const { return _vloop.in_bb(n); }
 
   // VLoopReductions Accessors
   bool is_marked_reduction(const Node* n) const {
-    return vloop_analyzer().reductions().is_marked_reduction(n);
+    return _vloop_analyzer.reductions().is_marked_reduction(n);
   }
 
   bool reduction(Node* n1, Node* n2) const {
-    return vloop_analyzer().reductions().is_marked_reduction_pair(n1, n2);
+    return _vloop_analyzer.reductions().is_marked_reduction_pair(n1, n2);
   }
 
   // VLoopMemorySlices Accessors
   bool same_memory_slice(MemNode* n1, MemNode* n2) const {
-    return vloop_analyzer().memory_slices().same_memory_slice(n1, n2);
+    return _vloop_analyzer.memory_slices().same_memory_slice(n1, n2);
   }
 
   // VLoopBody Accessors
   const GrowableArray<Node*>& body() const {
-    return vloop_analyzer().body().body();
+    return _vloop_analyzer.body().body();
   }
 
   int bb_idx(const Node* n) const     {
-    return vloop_analyzer().body().bb_idx(n);
+    return _vloop_analyzer.body().bb_idx(n);
   }
 
   // VLoopTypes Accessors
   const Type* velt_type(Node* n) const {
-    return vloop_analyzer().types().velt_type(n);
+    return _vloop_analyzer.types().velt_type(n);
   }
 
   BasicType velt_basic_type(Node* n) const {
-    return vloop_analyzer().types().velt_basic_type(n);
+    return _vloop_analyzer.types().velt_basic_type(n);
   }
 
   bool same_velt_type(Node* n1, Node* n2) const {
-    return vloop_analyzer().types().same_velt_type(n1, n2);
+    return _vloop_analyzer.types().same_velt_type(n1, n2);
   }
 
   int data_size(Node* n) const {
-    return vloop_analyzer().types().data_size(n);
+    return _vloop_analyzer.types().data_size(n);
   }
 
   int vector_width(Node* n) const {
-    return vloop_analyzer().types().vector_width(n);
+    return _vloop_analyzer.types().vector_width(n);
   }
 
   int vector_width_in_bytes(const Node* n) const {
-    return vloop_analyzer().types().vector_width_in_bytes(n);
+    return _vloop_analyzer.types().vector_width_in_bytes(n);
   }
 
 #ifndef PRODUCT
   // TraceAutoVectorization and TraceSuperWord
   bool is_trace_superword_alignment() const {
     // Too verbose for TraceSuperWord
-    return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
+    return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT);
   }
 
   bool is_trace_superword_dependence_graph() const {
     return TraceSuperWord ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH);
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH);
   }
 
   bool is_trace_superword_adjacent_memops() const {
     return TraceSuperWord ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS);
   }
 
   bool is_trace_superword_rejections() const {
     return TraceSuperWord ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS);
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS);
   }
 
   bool is_trace_superword_packset() const {
     return TraceSuperWord ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET);
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET);
   }
 
   bool is_trace_superword_info() const {
     return TraceSuperWord ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO);
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO);
   }
 
   bool is_trace_superword_verbose() const {
     // Too verbose for TraceSuperWord
-    return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
+    return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
   }
 
   bool is_trace_superword_any() const {
     return TraceSuperWord ||
            is_trace_align_vector() ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO) ||
-           vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) ||
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) ||
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) ||
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) ||
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) ||
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO) ||
+           _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE);
   }
 
   bool is_trace_align_vector() const {
-    return vloop().vtrace().is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) ||
+    return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) ||
            is_trace_superword_verbose();
   }
 #endif
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index 5f268bced090f..d6554670d16f6 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -115,17 +115,17 @@ VStatus VLoop::check_preconditions_helper() {
 // Return true iff all submodules are loaded successfully
 bool VLoopAnalyzer::setup_submodules() {
 #ifndef PRODUCT
-  if (vloop().is_trace_loop_analyzer()) {
+  if (_vloop.is_trace_loop_analyzer()) {
     tty->print_cr("\nVLoopAnalyzer::setup_submodules");
-    vloop().lpt()->dump_head();
-    vloop().cl()->dump();
+    _vloop.lpt()->dump_head();
+    _vloop.cl()->dump();
   }
 #endif
 
   VStatus status = setup_submodules_helper();
   if (!status.is_success()) {
 #ifndef PRODUCT
-    if (vloop().is_trace_loop_analyzer()) {
+    if (_vloop.is_trace_loop_analyzer()) {
       tty->print_cr("\nVLoopAnalyze::setup_submodules: failed: %s", status.failure_reason());
     }
 #endif
@@ -136,7 +136,7 @@ bool VLoopAnalyzer::setup_submodules() {
 
 VStatus VLoopAnalyzer::setup_submodules_helper() {
   // Skip any loop that has not been assigned max unroll by analysis.
-  if (SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0) {
+  if (SuperWordLoopUnrollAnalysis && _vloop.cl()->slp_max_unroll() == 0) {
     return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_MAX_UNROLL);
   }
 
@@ -284,7 +284,7 @@ bool VPointer::invariant(Node* n) const {
       // main loop (Illegal invariant happens when n_c is a CastII node that
       // prevents data nodes to flow above the main loop).
       Node* n_c = phase()->get_ctrl(n);
-      return phase()->is_dominator(n_c, vloop().pre_loop_head());
+      return phase()->is_dominator(n_c, _vloop.pre_loop_head());
     }
   }
   return is_not_member;
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index 827a9a3c26df5..a4cd3579e3828 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -130,27 +130,27 @@ class VLoop : public StackObj {
   const VTrace& vtrace()      const { return _vtrace; }
 
   bool is_trace_preconditions() const {
-    return vtrace().is_trace(TraceAutoVectorizationTag::PRECONDITIONS);
+    return _vtrace.is_trace(TraceAutoVectorizationTag::PRECONDITIONS);
   }
 
   bool is_trace_loop_analyzer() const {
-    return vtrace().is_trace(TraceAutoVectorizationTag::LOOP_ANALYZER);
+    return _vtrace.is_trace(TraceAutoVectorizationTag::LOOP_ANALYZER);
   }
 
   bool is_trace_memory_slices() const {
-    return vtrace().is_trace(TraceAutoVectorizationTag::MEMORY_SLICES);
+    return _vtrace.is_trace(TraceAutoVectorizationTag::MEMORY_SLICES);
   }
 
   bool is_trace_body() const {
-    return vtrace().is_trace(TraceAutoVectorizationTag::BODY);
+    return _vtrace.is_trace(TraceAutoVectorizationTag::BODY);
   }
 
   bool is_trace_vector_element_type() const {
-    return vtrace().is_trace(TraceAutoVectorizationTag::TYPES);
+    return _vtrace.is_trace(TraceAutoVectorizationTag::TYPES);
   }
 
   bool is_trace_pointer_analysis() const {
-    return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
+    return _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS);
   }
 #endif
 
@@ -221,7 +221,6 @@ class VLoopReductions : public StackObj {
   NONCOPYABLE(VLoopReductions);
 
 private:
-  const VLoop& vloop() const { return _vloop; }
   // Search for a path P = (n_1, n_2, ..., n_k) such that:
   // - original_input(n_i, input) = n_i+1 for all 1 <= i < k,
   // - path(n) for all n in P,
@@ -295,8 +294,6 @@ class VLoopMemorySlices : public StackObj {
   GrowableArray<PhiNode*> _heads;
   GrowableArray<MemNode*> _tails;
 
-  const VLoop& vloop() const { return _vloop; }
-
 public:
   VLoopMemorySlices(Arena* arena, const VLoop& vloop) :
     _vloop(vloop),
@@ -336,8 +333,6 @@ class VLoopBody : public StackObj {
   // Can be very large, and thus lives in VSharedData
   GrowableArray<int>&  _body_idx;
 
-  const VLoop& vloop() const { return _vloop; }
-
 public:
   VLoopBody(Arena* arena, const VLoop& vloop, VSharedData& vshared) :
     _vloop(vloop),
@@ -384,9 +379,6 @@ class VLoopTypes : public StackObj {
   // bb_idx -> vector element type
   GrowableArray<const Type*> _velt_type;
 
-  const VLoop& vloop() const    { return _vloop; }
-  const VLoopBody& body() const { return _body; }
-
 public:
   VLoopTypes(Arena* arena,
              const VLoop& vloop,
@@ -400,8 +392,8 @@ class VLoopTypes : public StackObj {
   NOT_PRODUCT( void print() const; )
 
   const Type* velt_type(const Node* n) const {
-    assert(vloop().in_bb(n), "only call on nodes in loop");
-    const Type* t = _velt_type.at(body().bb_idx(n));
+    assert(_vloop.in_bb(n), "only call on nodes in loop");
+    const Type* t = _velt_type.at(_body.bb_idx(n));
     assert(t != nullptr, "must have type");
     return t;
   }
@@ -439,8 +431,8 @@ class VLoopTypes : public StackObj {
 private:
   void set_velt_type(Node* n, const Type* t) {
     assert(t != nullptr, "cannot set nullptr");
-    assert(vloop().in_bb(n), "only call on nodes in loop");
-    _velt_type.at_put(body().bb_idx(n), t);
+    assert(_vloop.in_bb(n), "only call on nodes in loop");
+    _velt_type.at_put(_body.bb_idx(n), t);
   }
 
   // Smallest type containing range of values
@@ -478,7 +470,7 @@ class VLoopAnalyzer : StackObj {
     _reductions      (&_arena, vloop),
     _memory_slices   (&_arena, vloop),
     _body            (&_arena, vloop, vshared),
-    _types           (&_arena, vloop, body())
+    _types           (&_arena, vloop, _body)
   {
     _success = setup_submodules();
   }
@@ -522,10 +514,9 @@ class VPointer : public ArenaObj {
   bool        _analyze_only; // Used in loop unrolling only for vpointer trace
   uint        _stack_idx;    // Used in loop unrolling only for vpointer trace
 
-  const VLoop&    vloop() const { return _vloop; }
-  PhaseIdealLoop* phase() const { return vloop().phase(); }
-  IdealLoopTree*  lpt() const   { return vloop().lpt(); }
-  PhiNode*        iv() const    { return vloop().iv(); }
+  PhaseIdealLoop* phase() const { return _vloop.phase(); }
+  IdealLoopTree*  lpt() const   { return _vloop.lpt(); }
+  PhiNode*        iv() const    { return _vloop.iv(); }
 
   bool is_loop_member(Node* n) const;
   bool invariant(Node* n) const;
@@ -598,7 +589,7 @@ class VPointer : public ArenaObj {
   bool overlap_possible_with_any_in(Node_List* p) {
     for (uint k = 0; k < p->size(); k++) {
       MemNode* mem = p->at(k)->as_Mem();
-      VPointer p_mem(mem, vloop());
+      VPointer p_mem(mem, _vloop);
       // Only if we know that we have Less or Greater can we
       // be sure that there can never be an overlap between
       // the two memory regions.

From 77695f059a3ce581ee84c69fafd3bccfc656d2fb Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Thu, 15 Feb 2024 23:53:32 +0100
Subject: [PATCH 12/13] indentation

---
 src/hotspot/share/opto/superword.hpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp
index ca3c4284f9ee6..00a8c915ac7fb 100644
--- a/src/hotspot/share/opto/superword.hpp
+++ b/src/hotspot/share/opto/superword.hpp
@@ -224,13 +224,13 @@ class SuperWord : public ResourceObj {
   static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor);
 
   // VLoop Accessors
-  PhaseIdealLoop* phase()               const { return _vloop.phase(); }
-  PhaseIterGVN& igvn()                  const { return _vloop.phase()->igvn(); }
-  IdealLoopTree* lpt()                  const { return _vloop.lpt(); }
-  CountedLoopNode* cl()                 const { return _vloop.cl(); }
-  PhiNode* iv()                         const { return _vloop.iv(); }
-  int iv_stride()                       const { return cl()->stride_con(); }
-  bool in_bb(const Node* n)             const { return _vloop.in_bb(n); }
+  PhaseIdealLoop* phase()     const { return _vloop.phase(); }
+  PhaseIterGVN& igvn()        const { return _vloop.phase()->igvn(); }
+  IdealLoopTree* lpt()        const { return _vloop.lpt(); }
+  CountedLoopNode* cl()       const { return _vloop.cl(); }
+  PhiNode* iv()               const { return _vloop.iv(); }
+  int iv_stride()             const { return cl()->stride_con(); }
+  bool in_bb(const Node* n)   const { return _vloop.in_bb(n); }
 
   // VLoopReductions Accessors
   bool is_marked_reduction(const Node* n) const {

From 9c70b2d63135e3f26db0d64b76e75de1c570960c Mon Sep 17 00:00:00 2001
From: Emanuel Peter <emanuel.peter@oracle.com>
Date: Mon, 26 Feb 2024 10:09:23 +0100
Subject: [PATCH 13/13] review updates for Christian

---
 src/hotspot/share/opto/superword.cpp     | 14 +++++++-------
 src/hotspot/share/opto/vectorization.cpp |  2 +-
 src/hotspot/share/opto/vectorization.hpp | 10 +++++-----
 3 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp
index e4cce5f67fa9e..75b5e53f2792d 100644
--- a/src/hotspot/share/opto/superword.cpp
+++ b/src/hotspot/share/opto/superword.cpp
@@ -777,7 +777,7 @@ void SuperWord::dependence_graph() {
     MemNode* tail = mem_slice_tail.at(i);
 
     // Get slice in predecessor order (last is first)
-    _vloop_analyzer.memory_slices().get_slice(head, tail, slice_nodes);
+    _vloop_analyzer.memory_slices().get_slice_in_reverse_order(head, tail, slice_nodes);
 
     // Make the slice dependent on the root
     DepMem* slice = _dg.dep(head);
@@ -861,8 +861,8 @@ void VLoopMemorySlices::print() const {
 #endif
 
 // Get all memory nodes of a slice, in reverse order
-void VLoopMemorySlices::get_slice(PhiNode* head, MemNode* tail, GrowableArray<Node*> &slice) const {
-  assert(slice.length() == 0, "start empty");
+void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<Node*> &slice) const {
+  assert(slice.is_empty(), "start empty");
   Node* n = tail;
   Node* prev = nullptr;
   while (true) {
@@ -897,7 +897,7 @@ void VLoopMemorySlices::get_slice(PhiNode* head, MemNode* tail, GrowableArray<No
 
 #ifndef PRODUCT
   if (_vloop.is_trace_memory_slices()) {
-    tty->print_cr("\nVLoopMemorySlices::get_slice:");
+    tty->print_cr("\nVLoopMemorySlices::get_slice_in_reverse_order:");
     head->dump();
     for (int j = slice.length() - 1; j >= 0 ; j--) {
       slice.at(j)->dump();
@@ -2274,7 +2274,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) {
   // loop we may have a different last store, and we need to adjust the uses accordingly.
   GrowableArray<Node*> old_last_store_in_slice(max_slices, max_slices, nullptr);
 
-  const GrowableArray<PhiNode*> &mem_slice_head = _vloop_analyzer.memory_slices().heads();
+  const GrowableArray<PhiNode*>& mem_slice_head = _vloop_analyzer.memory_slices().heads();
 
   // (1) Set up the initial memory state from Phi. And find the old last store.
   for (int i = 0; i < mem_slice_head.length(); i++) {
@@ -2931,7 +2931,7 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) {
 
 // Return nullptr if success, else failure message
 VStatus VLoopBody::construct() {
-  assert(_body.length() == 0, "body is empty");
+  assert(_body.is_empty(), "body is empty");
 
   // First pass over loop body:
   //  (1) Check that there are no unwanted nodes (LoadStore, MergeMem, data Proj).
@@ -3202,7 +3202,7 @@ void VLoopTypes::compute_vector_element_type() {
     }
     if (nn->is_Cmp() && nn->in(0) == nullptr) {
       assert(_vloop.in_bb(nn->in(1)) || _vloop.in_bb(nn->in(2)),
-             "one of the inputs must be in the loop too");
+             "one of the inputs must be in the loop, too");
       if (_vloop.in_bb(nn->in(1))) {
         set_velt_type(n, velt_type(nn->in(1)));
       } else {
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp
index d6554670d16f6..d8d4e03210635 100644
--- a/src/hotspot/share/opto/vectorization.cpp
+++ b/src/hotspot/share/opto/vectorization.cpp
@@ -149,7 +149,7 @@ VStatus VLoopAnalyzer::setup_submodules_helper() {
   // If there is no memory slice detected, it means there is no store.
   // If there is no reduction and no store, then we give up, because
   // vectorization is not possible anyway (given current limitations).
-  if (!reductions().is_marked_reduction_loop() &&
+  if (!_reductions.is_marked_reduction_loop() &&
       _memory_slices.heads().is_empty()) {
     return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE);
   }
diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp
index a4cd3579e3828..3f897010d9db1 100644
--- a/src/hotspot/share/opto/vectorization.hpp
+++ b/src/hotspot/share/opto/vectorization.hpp
@@ -303,11 +303,11 @@ class VLoopMemorySlices : public StackObj {
 
   void find_memory_slices();
 
-  const GrowableArray<PhiNode*> &heads() const { return _heads; }
-  const GrowableArray<MemNode*> &tails() const { return _tails; }
+  const GrowableArray<PhiNode*>& heads() const { return _heads; }
+  const GrowableArray<MemNode*>& tails() const { return _tails; }
 
   // Get all memory nodes of a slice, in reverse order
-  void get_slice(PhiNode* head, MemNode* tail, GrowableArray<Node*> &slice) const;
+  void get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<Node*>& slice) const;
 
   bool same_memory_slice(MemNode* m1, MemNode* m2) const;
 
@@ -331,7 +331,7 @@ class VLoopBody : public StackObj {
 
   // Mapping node->_idx -> body_idx
   // Can be very large, and thus lives in VSharedData
-  GrowableArray<int>&  _body_idx;
+  GrowableArray<int>& _body_idx;
 
 public:
   VLoopBody(Arena* arena, const VLoop& vloop, VSharedData& vshared) :
@@ -463,7 +463,7 @@ class VLoopAnalyzer : StackObj {
   VLoopTypes           _types;
 
 public:
-  VLoopAnalyzer(const VLoop& vloop, VSharedData &vshared) :
+  VLoopAnalyzer(const VLoop& vloop, VSharedData& vshared) :
     _vloop(vloop),
     _arena(mtCompiler),
     _success(false),