From 3d7bca8358c292c787bc31ed91ebb7b5e14b2cff Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 10 Feb 2024 18:52:23 +0100 Subject: [PATCH 01/13] 8325589 --- src/hotspot/share/opto/loopopts.cpp | 5 ++ src/hotspot/share/opto/superword.cpp | 10 ---- .../share/opto/traceAutoVectorizationTag.hpp | 1 + src/hotspot/share/opto/vectorization.cpp | 34 +++++++++++++ src/hotspot/share/opto/vectorization.hpp | 50 +++++++++++++++++++ 5 files changed, 90 insertions(+), 10 deletions(-) diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index c5d8ed39d9d0c..db8e44f08e92d 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4232,6 +4232,11 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) { // Ensure the shared data is cleared before each use vshared.clear(); + const VLoopAnalyzer vloop_analyzer(vloop, vshared); + if (!vloop_analyzer.success()) { + return AutoVectorizeStatus::TriedAndFailed; + } + SuperWord sw(vloop, vshared); if (!sw.transform_loop()) { return AutoVectorizeStatus::TriedAndFailed; diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index aa1edd01ab19e..10589d31945c2 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -399,16 +399,6 @@ bool SuperWord::transform_loop() { } #endif - // Skip any loop that has not been assigned max unroll by analysis - if (SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0) { -#ifndef PRODUCT - if (is_trace_superword_any()) { - tty->print_cr("\nSuperWord::transform_loop failed: slp max unroll analysis was not already done"); - } -#endif - return false; - } - if (!SLP_extract()) { #ifndef PRODUCT if (is_trace_superword_any()) { diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index 79157aca309d6..78f1301010aae 100644 --- 
a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -31,6 +31,7 @@ #define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \ flags(POINTER_ANALYSIS, "Trace VPointer") \ flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \ + flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \ flags(SW_TYPES, "Trace SuperWord::compute_vector_element_type") \ flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \ flags(SW_MEMORY_SLICES, "Trace SuperWord memory slices") \ diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 98b996339fa46..a867561c1aa45 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -114,6 +114,40 @@ const char* VLoop::check_preconditions_helper() { return VLoop::SUCCESS; } +// Return true iff all submodules are loaded successfully +bool VLoopAnalyzer::setup_submodules() { +#ifndef PRODUCT + if (vloop().is_trace_loop_analyzer()) { + tty->print_cr("\nVLoopAnalyzer::setup_submodules"); + vloop().lpt()->dump_head(); + vloop().cl()->dump(); + } +#endif + + const char* state = setup_submodules_helper(); + if (state == VLoopAnalyzer::SUCCESS) { + return true; // success + } + +#ifndef PRODUCT + if (vloop().is_trace_loop_analyzer()) { + tty->print_cr("\nVLoopAnalyze::setup_submodules: failed: %s", state); + } +#endif + return false; // failed +} + +// Return SUCCESS string iff all submodules are setup successfully +const char* VLoopAnalyzer::setup_submodules_helper() { + // Skip any loop that has not been assigned max unroll by analysis. 
+ if (SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0) { + return VLoopAnalyzer::FAILURE_NO_MAX_UNROLL; + } + + // TODO + return VLoopAnalyzer::SUCCESS; +} + #ifndef PRODUCT int VPointer::Tracer::_depth = 0; #endif diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 7aff58db4bb30..6106cb8547ccf 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -111,6 +111,10 @@ class VLoop : public StackObj { return vtrace().is_trace(TraceAutoVectorizationTag::PRECONDITIONS); } + bool is_trace_loop_analyzer() const { + return vtrace().is_trace(TraceAutoVectorizationTag::LOOP_ANALYZER); + } + bool is_trace_pointer_analysis() const { return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS); } @@ -166,6 +170,52 @@ class VSharedData : public StackObj { } }; +// TODO submodules + +// Analyze the loop in preparation for auto-vectorization. This class is +// deliberately structured into many submodules, which are as independent +// as possible, though some submodules do require other submodules. 
+class VLoopAnalyzer : StackObj { +private: + // TODO check if all are really needed + static constexpr char const* SUCCESS = "success"; + static constexpr char const* FAILURE_NO_MAX_UNROLL = "slp max unroll analysis required"; + static constexpr char const* FAILURE_NO_REDUCTION_OR_STORE = "no reduction and no store in loop"; + + const VLoop& _vloop; + + // Arena for all submodules + Arena _arena; + + // If all submodules are setup successfully, we set this flag at the + // end of the constructor + bool _success; + + // Submodules + // TODO + +public: + VLoopAnalyzer(const VLoop& vloop, VSharedData &vshared) : + _vloop(vloop), + _arena(mtCompiler), + _success(false) + // TODO modules + { + _success = setup_submodules(); + } + NONCOPYABLE(VLoopAnalyzer); + + bool success() const { return _success; } + + // Read-only accessors for submodules + const VLoop& vloop() const { return _vloop; } + // TODO + +private: + bool setup_submodules(); + const char* setup_submodules_helper(); +}; + // A vectorization pointer (VPointer) has information about an address for // dependence checking and vector alignment. It's usually bound to a memory // operation in a counted loop for vectorizable analysis. 
From 4fcbad64628b70ad7935a88d53802758c59d397d Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 10 Feb 2024 19:52:43 +0100 Subject: [PATCH 02/13] VLoopReductions --- src/hotspot/cpu/x86/x86_64.ad | 16 ++-- src/hotspot/share/opto/loopopts.cpp | 2 +- src/hotspot/share/opto/superword.cpp | 84 ++++++++------------- src/hotspot/share/opto/superword.hpp | 93 ++++++------------------ src/hotspot/share/opto/vectorization.cpp | 14 ++++ src/hotspot/share/opto/vectorization.hpp | 84 ++++++++++++++++++++- 6 files changed, 158 insertions(+), 135 deletions(-) diff --git a/src/hotspot/cpu/x86/x86_64.ad b/src/hotspot/cpu/x86/x86_64.ad index a248daaa1917b..eb063c9563a2a 100644 --- a/src/hotspot/cpu/x86/x86_64.ad +++ b/src/hotspot/cpu/x86/x86_64.ad @@ -4480,7 +4480,7 @@ instruct loadD(regD dst, memory mem) // max = java.lang.Math.max(float a, float b) instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{ - predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); + predicate(UseAVX > 0 && !VLoopReductions::is_reduction(n)); match(Set dst (MaxF a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ "maxF $dst, $a, $b \t! 
using tmp, atmp and btmp as TEMP" %} @@ -4491,7 +4491,7 @@ instruct maxF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, %} instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && SuperWord::is_reduction(n)); + predicate(UseAVX > 0 && VLoopReductions::is_reduction(n)); match(Set dst (MaxF a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); @@ -4505,7 +4505,7 @@ instruct maxF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe // max = java.lang.Math.max(double a, double b) instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{ - predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); + predicate(UseAVX > 0 && !VLoopReductions::is_reduction(n)); match(Set dst (MaxD a b)); effect(USE a, USE b, TEMP atmp, TEMP btmp, TEMP tmp); format %{ "maxD $dst, $a, $b \t! using tmp, atmp and btmp as TEMP" %} @@ -4516,7 +4516,7 @@ instruct maxD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, %} instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && SuperWord::is_reduction(n)); + predicate(UseAVX > 0 && VLoopReductions::is_reduction(n)); match(Set dst (MaxD a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); @@ -4530,7 +4530,7 @@ instruct maxD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRe // min = java.lang.Math.min(float a, float b) instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, legRegF btmp) %{ - predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); + predicate(UseAVX > 0 && !VLoopReductions::is_reduction(n)); match(Set dst (MinF a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ "minF $dst, $a, $b \t! 
using tmp, atmp and btmp as TEMP" %} @@ -4541,7 +4541,7 @@ instruct minF_reg(legRegF dst, legRegF a, legRegF b, legRegF tmp, legRegF atmp, %} instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRegI tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && SuperWord::is_reduction(n)); + predicate(UseAVX > 0 && VLoopReductions::is_reduction(n)); match(Set dst (MinF a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); @@ -4555,7 +4555,7 @@ instruct minF_reduction_reg(legRegF dst, legRegF a, legRegF b, legRegF xmmt, rRe // min = java.lang.Math.min(double a, double b) instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, legRegD btmp) %{ - predicate(UseAVX > 0 && !SuperWord::is_reduction(n)); + predicate(UseAVX > 0 && !VLoopReductions::is_reduction(n)); match(Set dst (MinD a b)); effect(USE a, USE b, TEMP tmp, TEMP atmp, TEMP btmp); format %{ "minD $dst, $a, $b \t! using tmp, atmp and btmp as TEMP" %} @@ -4566,7 +4566,7 @@ instruct minD_reg(legRegD dst, legRegD a, legRegD b, legRegD tmp, legRegD atmp, %} instruct minD_reduction_reg(legRegD dst, legRegD a, legRegD b, legRegD xmmt, rRegL tmp, rFlagsReg cr) %{ - predicate(UseAVX > 0 && SuperWord::is_reduction(n)); + predicate(UseAVX > 0 && VLoopReductions::is_reduction(n)); match(Set dst (MinD a b)); effect(USE a, USE b, TEMP xmmt, TEMP tmp, KILL cr); diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index db8e44f08e92d..406158eaee42f 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4237,7 +4237,7 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) { return AutoVectorizeStatus::TriedAndFailed; } - SuperWord sw(vloop, vshared); + SuperWord sw(vloop_analyzer, vshared); if (!sw.transform_loop()) { return AutoVectorizeStatus::TriedAndFailed; } diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 10589d31945c2..3e7aeb2432a7b 100644 
--- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -38,20 +38,19 @@ #include "opto/movenode.hpp" #include "utilities/powerOfTwo.hpp" -SuperWord::SuperWord(const VLoop &vloop, VSharedData &vshared) : - _vloop(vloop), +SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared) : + _vloop_analyzer(vloop_analyzer), _arena(mtCompiler), _packset(arena(), 8, 0, nullptr), // packs for the current block _bb_idx(vshared.node_idx_to_loop_body_idx()), // node idx to index in bb - _block(arena(), vloop.estimated_body_length(), 0, nullptr), // nodes in current block + _block(arena(), vloop().estimated_body_length(), 0, nullptr), // nodes in current block _mem_slice_head(arena(), 8, 0, nullptr), // memory slice heads _mem_slice_tail(arena(), 8, 0, nullptr), // memory slice tails - _node_info(arena(), vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node + _node_info(arena(), vloop().estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node _clone_map(phase()->C->clone_map()), // map of nodes created in cloning _align_to_ref(nullptr), // memory reference to align vectors to _dg(arena()), // dependence graph - _nlist(arena(), vloop.estimated_body_length(), 0, nullptr), // scratch list of nodes - _loop_reductions(arena()), // reduction nodes in the current loop + _nlist(arena(), vloop().estimated_body_length(), 0, nullptr), // scratch list of nodes _race_possible(false), // cases where SDMU is true _do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style _num_work_vecs(0), // amount of vector work we have @@ -255,7 +254,7 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa } } -bool SuperWord::is_reduction(const Node* n) { +bool VLoopReductions::is_reduction(const Node* n) { if (!is_reduction_operator(n)) { return false; } @@ -269,12 +268,12 @@ bool SuperWord::is_reduction(const Node* n) { return false; } -bool 
SuperWord::is_reduction_operator(const Node* n) { +bool VLoopReductions::is_reduction_operator(const Node* n) { int opc = n->Opcode(); return (opc != ReductionNode::opcode(opc, n->bottom_type()->basic_type())); } -bool SuperWord::in_reduction_cycle(const Node* n, uint input) { +bool VLoopReductions::in_reduction_cycle(const Node* n, uint input) { // First find input reduction path to phi node. auto has_my_opcode = [&](const Node* m){ return m->Opcode() == n->Opcode(); }; PathEnd path_to_phi = find_in_path(n, input, LoopMaxUnroll, has_my_opcode, @@ -291,7 +290,7 @@ bool SuperWord::in_reduction_cycle(const Node* n, uint input) { return path_from_phi.first != nullptr; } -Node* SuperWord::original_input(const Node* n, uint i) { +Node* VLoopReductions::original_input(const Node* n, uint i) { if (n->has_swapped_edges()) { assert(n->is_Add() || n->is_Mul(), "n should be commutative"); if (i == 1) { @@ -303,21 +302,21 @@ Node* SuperWord::original_input(const Node* n, uint i) { return n->in(i); } -void SuperWord::mark_reductions() { - - _loop_reductions.clear(); +void VLoopReductions::mark_reductions() { + assert(_loop_reductions.is_empty(), "must not yet be computed"); + CountedLoopNode* cl = vloop().cl(); // Iterate through all phi nodes associated to the loop and search for // reduction cycles in the basic block. - for (DUIterator_Fast imax, i = cl()->fast_outs(imax); i < imax; i++) { - const Node* phi = cl()->fast_out(i); + for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) { + const Node* phi = cl->fast_out(i); if (!phi->is_Phi()) { continue; } if (phi->outcnt() == 0) { continue; } - if (phi == iv()) { + if (phi == vloop().iv()) { continue; } // The phi's loop-back is considered the first node in the reduction cycle. @@ -341,8 +340,9 @@ void SuperWord::mark_reductions() { // to the phi node following edge index 'input'. 
PathEnd path = find_in_path( - first, input, lpt()->_body.size(), - [&](const Node* n) { return n->Opcode() == first->Opcode() && in_bb(n); }, + first, input, vloop().lpt()->_body.size(), + [&](const Node* n) { return n->Opcode() == first->Opcode() && + vloop().in_bb(n); }, [&](const Node* n) { return n == phi; }); if (path.first != nullptr) { reduction_input = input; @@ -361,7 +361,7 @@ void SuperWord::mark_reductions() { for (int i = 0; i < path_nodes; i++) { for (DUIterator_Fast jmax, j = current->fast_outs(jmax); j < jmax; j++) { Node* u = current->fast_out(j); - if (!in_bb(u)) { + if (!vloop().in_bb(u)) { continue; } if (u == succ) { @@ -381,6 +381,7 @@ void SuperWord::mark_reductions() { } // Reduction cycle found. Mark all nodes in the found path as reductions. current = first; + // TODO trace this for (int i = 0; i < path_nodes; i++) { _loop_reductions.set(current->_idx); current = original_input(current, reduction_input); @@ -453,24 +454,11 @@ bool SuperWord::transform_loop() { bool SuperWord::SLP_extract() { assert(cl()->is_main_loop(), "SLP should only work on main loops"); - if (SuperWordReductions) { - mark_reductions(); - } + // TODO remove all the VLoopAnalyzer stuff // Find memory slices find_memory_slices(); - if (!is_marked_reduction_loop() && - _mem_slice_head.is_empty()) { -#ifndef PRODUCT - if (is_trace_superword_any()) { - tty->print_cr("\nNo reductions or memory slices found, abort SuperWord."); - tty->cr(); - } -#endif - return false; - } - // Ready the block if (!construct_bb()) { #ifndef PRODUCT @@ -1120,26 +1108,19 @@ bool SuperWord::have_similar_inputs(Node* s1, Node* s2) { return true; } -//------------------------------reduction--------------------------- -// Is there a data path between s1 and s2 and the nodes reductions? 
-bool SuperWord::reduction(Node* s1, Node* s2) { - bool retValue = false; - int d1 = depth(s1); - int d2 = depth(s2); - if (d2 > d1) { - if (is_marked_reduction(s1) && is_marked_reduction(s2)) { - // This is an ordered set, so s1 should define s2 - for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { - Node* t1 = s1->fast_out(i); - if (t1 == s2) { - // both nodes are reductions and connected - retValue = true; - } +bool VLoopReductions::is_marked_reduction_pair(Node* s1, Node* s2) const { + if (is_marked_reduction(s1) && + is_marked_reduction(s2)) { + // This is an ordered set, so s1 should define s2 + for (DUIterator_Fast imax, i = s1->fast_outs(imax); i < imax; i++) { + Node* t1 = s1->fast_out(i); + if (t1 == s2) { + // both nodes are reductions and connected + return true; } } } - - return retValue; + return false; } //------------------------------set_alignment--------------------------- @@ -1876,9 +1857,8 @@ bool SuperWord::profitable(Node_List* p) { Node* second_in = p0->in(2); Node_List* second_pk = my_pack(second_in); if ((second_pk == nullptr) || (_num_work_vecs == _num_reductions)) { - // Unmark reduction if no parent pack or if not enough work + // No parent pack or not enough work // to cover reduction expansion overhead - _loop_reductions.remove(p0->_idx); return false; } else if (second_pk->size() != p->size()) { return false; diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 691aa97928a2c..06ba6653d2766 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -26,7 +26,6 @@ #include "opto/vectorization.hpp" #include "utilities/growableArray.hpp" -#include "utilities/pair.hpp" // // S U P E R W O R D T R A N S F O R M @@ -199,7 +198,7 @@ class SWNodeInfo { // Transforms scalar operations into packed (superword) operations. 
class SuperWord : public ResourceObj { private: - const VLoop& _vloop; + const VLoopAnalyzer& _vloop_analyzer; // Arena for small data structures. Large data structures are allocated in // VSharedData, and reused over many AutoVectorizations. @@ -224,7 +223,7 @@ class SuperWord : public ResourceObj { GrowableArray<Node*> _nlist; // List of nodes public: - SuperWord(const VLoop &vloop, VSharedData &vshared); + SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared); // Attempt to run the SuperWord algorithm on the loop. Return true if we succeed. bool transform_loop(); @@ -232,15 +231,26 @@ class SuperWord : public ResourceObj { // Decide if loop can eventually be vectorized, and what unrolling factor is required. static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor); + // VLoopAnalyzer Accessors + const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; } + // VLoop Accessors - const VLoop& vloop() const { return _vloop; } - PhaseIdealLoop* phase() const { return vloop().phase(); } - PhaseIterGVN& igvn() const { return vloop().phase()->igvn(); } - IdealLoopTree* lpt() const { return vloop().lpt(); } - CountedLoopNode* cl() const { return vloop().cl(); } - PhiNode* iv() const { return vloop().iv(); } - int iv_stride() const { return cl()->stride_con(); } - bool in_bb(const Node* n) const { return vloop().in_bb(n); } + const VLoop& vloop() const { return vloop_analyzer().vloop(); } + PhaseIdealLoop* phase() const { return vloop().phase(); } + PhaseIterGVN& igvn() const { return vloop().phase()->igvn(); } + IdealLoopTree* lpt() const { return vloop().lpt(); } + CountedLoopNode* cl() const { return vloop().cl(); } + PhiNode* iv() const { return vloop().iv(); } + int iv_stride() const { return cl()->stride_con(); } + bool in_bb(const Node* n) const { return vloop().in_bb(n); } + + // VLoopReductions Accessors + bool is_marked_reduction(const Node* n) const { + return vloop_analyzer().reductions().is_marked_reduction(n); + } 
+ bool reduction(Node* s1, Node* s2) const { + return vloop_analyzer().reductions().is_marked_reduction_pair(s1, s2); + } #ifndef PRODUCT // TraceAutoVectorization and TraceSuperWord @@ -315,7 +325,6 @@ class SuperWord : public ResourceObj { const GrowableArray<Node*>& block() const { return _block; } const DepGraph& dg() const { return _dg; } private: - VectorSet _loop_reductions; // Reduction nodes in the current loop bool _race_possible; // In cases where SDMU is true bool _do_vector_loop; // whether to do vectorization/simd style int _num_work_vecs; // Number of non memory vector operations @@ -376,65 +385,7 @@ class SuperWord : public ResourceObj { bool same_origin_idx(Node* a, Node* b) const; bool same_generation(Node* a, Node* b) const; - // methods - - typedef const Pair<const Node*, int> PathEnd; - - // Search for a path P = (n_1, n_2, ..., n_k) such that: - // - original_input(n_i, input) = n_i+1 for all 1 <= i < k, - // - path(n) for all n in P, - // - k <= max, and - // - there exists a node e such that original_input(n_k, input) = e and end(e). - // Return <e, k>, if P is found, or <nullptr, -1> otherwise. - // Note that original_input(n, i) has the same behavior as n->in(i) except - // that it commutes the inputs of binary nodes whose edges have been swapped. - template <typename NodePredicate1, typename NodePredicate2> - static PathEnd find_in_path(const Node *n1, uint input, int max, - NodePredicate1 path, NodePredicate2 end) { - const PathEnd no_path(nullptr, -1); - const Node* current = n1; - int k = 0; - for (int i = 0; i <= max; i++) { - if (current == nullptr) { - return no_path; - } - if (end(current)) { - return PathEnd(current, k); - } - if (!path(current)) { - return no_path; - } - current = original_input(current, input); - k++; - } - return no_path; - } - -public: - // Whether n is a reduction operator and part of a reduction cycle. - // This function can be used for individual queries outside the SLP analysis, - // e.g. to inform matching in target-specific code. 
Otherwise, the - // almost-equivalent but faster SuperWord::mark_reductions() is preferable. - static bool is_reduction(const Node* n); - // Whether n is marked as a reduction node. - bool is_marked_reduction(Node* n) { return _loop_reductions.test(n->_idx); } - // Whether the current loop has any reduction node. - bool is_marked_reduction_loop() { return !_loop_reductions.is_empty(); } private: - // Whether n is a standard reduction operator. - static bool is_reduction_operator(const Node* n); - // Whether n is part of a reduction cycle via the 'input' edge index. To bound - // the search, constrain the size of reduction cycles to LoopMaxUnroll. - static bool in_reduction_cycle(const Node* n, uint input); - // Reference to the i'th input node of n, commuting the inputs of binary nodes - // whose edges have been swapped. Assumes n is a commutative operation. - static Node* original_input(const Node* n, uint i); - // Find and mark reductions in a loop. Running mark_reductions() is similar to - // querying is_reduction(n) for every n in the SuperWord loop, but stricter in - // that it assumes counted loops and requires that reduction nodes are not - // used within the loop except by their reduction cycle predecessors. - void mark_reductions(); - // Extract the superword level parallelism bool SLP_extract(); // Find the adjacent memory references and create pack pairs for them. void find_adjacent_refs(); @@ -466,8 +417,6 @@ class SuperWord : public ResourceObj { // For a node pair (s1, s2) which is isomorphic and independent, // do s1 and s2 have similar input edges? bool have_similar_inputs(Node* s1, Node* s2); - // Is there a data path between s1 and s2 and both are reductions? - bool reduction(Node* s1, Node* s2); void set_alignment(Node* s1, Node* s2, int align); int data_size(Node* s); // Extend packset by following use->def and def->use links from pack members. 
diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index a867561c1aa45..a1369da7f4df3 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -144,6 +144,20 @@ const char* VLoopAnalyzer::setup_submodules_helper() { return VLoopAnalyzer::FAILURE_NO_MAX_UNROLL; } + if (SuperWordReductions) { + _reductions.mark_reductions(); + } + + // TODO _memory_slices.analyze(); + + // // If there is no memory slice detected, that means there is no store. + // // If there is no reduction and no store, then we give up, because + // // vectorization is not possible anyway (given current limitations). + // if (!reductions().is_marked_reduction_loop() && + // _memory_slices.heads().is_empty()) { + // return VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE; + // } + // TODO return VLoopAnalyzer::SUCCESS; } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 6106cb8547ccf..30c386465fd9f 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -28,6 +28,7 @@ #include "opto/node.hpp" #include "opto/loopnode.hpp" #include "opto/traceAutoVectorizationTag.hpp" +#include "utilities/pair.hpp" // Code in this file and the vectorization.cpp contains shared logics and // utilities for C2's loop auto-vectorization. @@ -170,7 +171,81 @@ class VSharedData : public StackObj { } }; -// TODO submodules +// Submodule of VLoopAnalyzer. +// Identify and mark all reductions in the loop. 
+class VLoopReductions : public StackObj { +private: + typedef const Pair<const Node*, int> PathEnd; + + const VLoop& _vloop; + VectorSet _loop_reductions; + +public: + VLoopReductions(Arena* arena, const VLoop& vloop) : + _vloop(vloop), + _loop_reductions(arena){}; + + NONCOPYABLE(VLoopReductions); + +private: + const VLoop& vloop() const { return _vloop; } + + // Search for a path P = (n_1, n_2, ..., n_k) such that: + // - original_input(n_i, input) = n_i+1 for all 1 <= i < k, + // - path(n) for all n in P, + // - k <= max, and + // - there exists a node e such that original_input(n_k, input) = e and end(e). + // Return <e, k>, if P is found, or <nullptr, -1> otherwise. + // Note that original_input(n, i) has the same behavior as n->in(i) except + // that it commutes the inputs of binary nodes whose edges have been swapped. + template <typename NodePredicate1, typename NodePredicate2> + static PathEnd find_in_path(const Node* n1, uint input, int max, + NodePredicate1 path, NodePredicate2 end) { + const PathEnd no_path(nullptr, -1); + const Node* current = n1; + int k = 0; + for (int i = 0; i <= max; i++) { + if (current == nullptr) { + return no_path; + } + if (end(current)) { + return PathEnd(current, k); + } + if (!path(current)) { + return no_path; + } + current = original_input(current, input); + k++; + } + return no_path; + } + +public: + // Find and mark reductions in a loop. Running mark_reductions() is similar to + // querying is_reduction(n) for every node in the loop, but stricter in + // that it assumes counted loops and requires that reduction nodes are not + // used within the loop except by their reduction cycle predecessors. + void mark_reductions(); + // Whether n is a reduction operator and part of a reduction cycle. + // This function can be used for individual queries outside auto-vectorization, + // e.g. to inform matching in target-specific code. Otherwise, the + // almost-equivalent but faster mark_reductions() is preferable. + static bool is_reduction(const Node* n); + // Whether n is marked as a reduction node. 
+ bool is_marked_reduction(const Node* n) const { return _loop_reductions.test(n->_idx); } + bool is_marked_reduction_loop() const { return !_loop_reductions.is_empty(); } + // Are s1 and s2 reductions with a data path between them? + bool is_marked_reduction_pair(Node* s1, Node* s2) const; +private: + // Whether n is a standard reduction operator. + static bool is_reduction_operator(const Node* n); + // Whether n is part of a reduction cycle via the 'input' edge index. To bound + // the search, constrain the size of reduction cycles to LoopMaxUnroll. + static bool in_reduction_cycle(const Node* n, uint input); + // Reference to the i'th input node of n, commuting the inputs of binary nodes + // whose edges have been swapped. Assumes n is a commutative operation. + static Node* original_input(const Node* n, uint i); +}; // Analyze the loop in preparation for auto-vectorization. This class is // deliberately structured into many submodules, which are as independent @@ -193,12 +268,14 @@ class VLoopAnalyzer : StackObj { // Submodules // TODO + VLoopReductions _reductions; public: VLoopAnalyzer(const VLoop& vloop, VSharedData &vshared) : _vloop(vloop), _arena(mtCompiler), - _success(false) + _success(false), + _reductions (&_arena, vloop) // TODO modules { _success = setup_submodules(); @@ -207,8 +284,11 @@ class VLoopAnalyzer : StackObj { bool success() const { return _success; } + Arena* arena() { return &_arena; } + // Read-only accessors for submodules const VLoop& vloop() const { return _vloop; } + const VLoopReductions& reductions() const { return _reductions; } // TODO private: From 760b3798dc5c02acad5720a505ac22472010385b Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 10 Feb 2024 20:46:28 +0100 Subject: [PATCH 03/13] VLoopMemorySlices --- src/hotspot/share/opto/superword.cpp | 115 +++++++++--------- src/hotspot/share/opto/superword.hpp | 28 ++--- .../share/opto/traceAutoVectorizationTag.hpp | 4 +- src/hotspot/share/opto/vectorization.cpp | 18 +-- 
src/hotspot/share/opto/vectorization.hpp | 53 +++++++- 5 files changed, 128 insertions(+), 90 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 3e7aeb2432a7b..a0d56f08a8124 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -44,13 +44,10 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared) _packset(arena(), 8, 0, nullptr), // packs for the current block _bb_idx(vshared.node_idx_to_loop_body_idx()), // node idx to index in bb _block(arena(), vloop().estimated_body_length(), 0, nullptr), // nodes in current block - _mem_slice_head(arena(), 8, 0, nullptr), // memory slice heads - _mem_slice_tail(arena(), 8, 0, nullptr), // memory slice tails _node_info(arena(), vloop().estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node _clone_map(phase()->C->clone_map()), // map of nodes created in cloning _align_to_ref(nullptr), // memory reference to align vectors to _dg(arena()), // dependence graph - _nlist(arena(), vloop().estimated_body_length(), 0, nullptr), // scratch list of nodes _race_possible(false), // cases where SDMU is true _do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style _num_work_vecs(0), // amount of vector work we have @@ -456,9 +453,6 @@ bool SuperWord::SLP_extract() { // TODO remove all the VLoopAnalyzer stuff - // Find memory slices - find_memory_slices(); - // Ready the block if (!construct_bb()) { #ifndef PRODUCT @@ -785,16 +779,22 @@ void SuperWord::dependence_graph() { } } + const GrowableArray<PhiNode*> &mem_slice_head = vloop_analyzer().memory_slices().heads(); + const GrowableArray<MemNode*> &mem_slice_tail = vloop_analyzer().memory_slices().tails(); + + ResourceMark rm; + GrowableArray<Node*> slice_nodes; + // For each memory slice, create the dependences - for (int i = 0; i < _mem_slice_head.length(); i++) { - Node* n = _mem_slice_head.at(i); - Node* n_tail = _mem_slice_tail.at(i); + for 
(int i = 0; i < mem_slice_head.length(); i++) { + PhiNode* head = mem_slice_head.at(i); + MemNode* tail = mem_slice_tail.at(i); // Get slice in predecessor order (last is first) - mem_slice_preds(n_tail, n, _nlist); + vloop_analyzer().memory_slices().get_slice(head, tail, slice_nodes); // Make the slice dependent on the root - DepMem* slice = _dg.dep(n); + DepMem* slice = _dg.dep(head); _dg.make_edge(_dg.root(), slice); // Create a sink for the slice @@ -802,8 +802,8 @@ void SuperWord::dependence_graph() { _dg.make_edge(slice_sink, _dg.tail()); // Now visit each pair of memory ops, creating the edges - for (int j = _nlist.length() - 1; j >= 0 ; j--) { - Node* s1 = _nlist.at(j); + for (int j = slice_nodes.length() - 1; j >= 0 ; j--) { + Node* s1 = slice_nodes.at(j); // If no dependency yet, use slice if (_dg.dep(s1)->in_cnt() == 0) { @@ -812,7 +812,7 @@ void SuperWord::dependence_graph() { VPointer p1(s1->as_Mem(), vloop()); bool sink_dependent = true; for (int k = j - 1; k >= 0; k--) { - Node* s2 = _nlist.at(k); + Node* s2 = slice_nodes.at(k); if (s1->is_Load() && s2->is_Load()) continue; VPointer p2(s2->as_Mem(), vloop()); @@ -831,68 +831,68 @@ void SuperWord::dependence_graph() { #ifndef PRODUCT if (is_trace_superword_dependence_graph()) { - tty->print_cr("\nDependence graph for slice: %d", n->_idx); - for (int q = 0; q < _nlist.length(); q++) { - _dg.print(_nlist.at(q)); + tty->print_cr("\nDependence graph for slice: %d", head->_idx); + for (int q = 0; q < slice_nodes.length(); q++) { + _dg.print(slice_nodes.at(q)); } tty->cr(); } #endif - _nlist.clear(); + slice_nodes.clear(); } } -void SuperWord::find_memory_slices() { - assert(_mem_slice_head.length() == 0, "mem_slice_head is empty"); - assert(_mem_slice_tail.length() == 0, "mem_slice_tail is empty"); +void VLoopMemorySlices::find_memory_slices() { + assert(_heads.is_empty(), "not yet computed"); + assert(_tails.is_empty(), "not yet computed"); + CountedLoopNode* cl = vloop().cl(); // Iterate over all memory 
phis - for (DUIterator_Fast imax, i = cl()->fast_outs(imax); i < imax; i++) { - PhiNode* phi = cl()->fast_out(i)->isa_Phi(); - if (phi != nullptr && in_bb(phi) && phi->is_memory_phi()) { + for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) { + PhiNode* phi = cl->fast_out(i)->isa_Phi(); + if (phi != nullptr && vloop().in_bb(phi) && phi->is_memory_phi()) { Node* phi_tail = phi->in(LoopNode::LoopBackControl); if (phi_tail != phi->in(LoopNode::EntryControl)) { - _mem_slice_head.push(phi); - _mem_slice_tail.push(phi_tail->as_Mem()); + _heads.push(phi); + _tails.push(phi_tail->as_Mem()); } } } - NOT_PRODUCT( if (is_trace_superword_memory_slices()) { print_memory_slices(); } ) + NOT_PRODUCT( if (vloop().is_trace_memory_slices()) { print(); } ) } #ifndef PRODUCT -void SuperWord::print_memory_slices() { - tty->print_cr("\nSuperWord::print_memory_slices: %s", - _mem_slice_head.length() > 0 ? "" : "NONE"); - for (int m = 0; m < _mem_slice_head.length(); m++) { - tty->print("%6d ", m); _mem_slice_head.at(m)->dump(); - tty->print(" "); _mem_slice_tail.at(m)->dump(); +void VLoopMemorySlices::print() const { + tty->print_cr("\nVLoopMemorySlices::print: %s", + heads().length() > 0 ? 
"" : "NONE"); + for (int m = 0; m < heads().length(); m++) { + tty->print("%6d ", m); heads().at(m)->dump(); + tty->print(" "); tails().at(m)->dump(); } } #endif -//---------------------------mem_slice_preds--------------------------- -// Return a memory slice (node list) in predecessor order starting at "start" -void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray &preds) { - assert(preds.length() == 0, "start empty"); - Node* n = start; +// Get all memory nodes of a slice, in reverse order +void VLoopMemorySlices::get_slice(PhiNode* head, MemNode* tail, GrowableArray &slice) const { + assert(slice.length() == 0, "start empty"); + Node* n = tail; Node* prev = nullptr; while (true) { - assert(in_bb(n), "must be in block"); + assert(vloop().in_bb(n), "must be in block"); for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { Node* out = n->fast_out(i); if (out->is_Load()) { - if (in_bb(out)) { - preds.push(out); + if (vloop().in_bb(out)) { + slice.push(out); } } else { // FIXME - if (out->is_MergeMem() && !in_bb(out)) { + if (out->is_MergeMem() && !vloop().in_bb(out)) { // Either unrolling is causing a memory edge not to disappear, // or need to run igvn.optimize() again before SLP - } else if (out->is_memory_phi() && !in_bb(out)) { + } else if (out->is_memory_phi() && !vloop().in_bb(out)) { // Ditto. Not sure what else to check further. } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) { // StoreCM has an input edge used as a precedence edge. 
@@ -902,19 +902,19 @@ void SuperWord::mem_slice_preds(Node* start, Node* stop, GrowableArray &p } }//else }//for - if (n == stop) break; - preds.push(n); + if (n == head) { break; } + slice.push(n); prev = n; assert(n->is_Mem(), "unexpected node %s", n->Name()); n = n->in(MemNode::Memory); } #ifndef PRODUCT - if (is_trace_superword_memory_slices()) { - tty->print_cr("\nSuperWord::mem_slice_preds:"); - stop->dump(); - for (int j = preds.length() - 1; j >= 0 ; j--) { - preds.at(j)->dump(); + if (vloop().is_trace_memory_slices()) { + tty->print_cr("\nVLoopMemorySlices::get_slice:"); + head->dump(); + for (int j = slice.length() - 1; j >= 0 ; j--) { + slice.at(j)->dump(); } } #endif @@ -2262,9 +2262,11 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) { // loop we may have a different last store, and we need to adjust the uses accordingly. GrowableArray old_last_store_in_slice(max_slices, max_slices, nullptr); + const GrowableArray &mem_slice_head = vloop_analyzer().memory_slices().heads(); + // (1) Set up the initial memory state from Phi. And find the old last store. - for (int i = 0; i < _mem_slice_head.length(); i++) { - Node* phi = _mem_slice_head.at(i); + for (int i = 0; i < mem_slice_head.length(); i++) { + Node* phi = mem_slice_head.at(i); assert(phi->is_Phi(), "must be phi"); int alias_idx = phase()->C->get_alias_index(phi->adr_type()); current_state_in_slice.at_put(alias_idx, phi); @@ -2299,8 +2301,8 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) { // in the Phi. Further, we replace uses of the old last store // with uses of the new last store (current_state). 
Node_List uses_after_loop; - for (int i = 0; i < _mem_slice_head.length(); i++) { - Node* phi = _mem_slice_head.at(i); + for (int i = 0; i < mem_slice_head.length(); i++) { + Node* phi = mem_slice_head.at(i); int alias_idx = phase()->C->get_alias_index(phi->adr_type()); Node* current_state = current_state_in_slice.at(alias_idx); assert(current_state != nullptr, "slice is mapped"); @@ -3274,8 +3276,9 @@ bool SuperWord::same_velt_type(Node* n1, Node* n2) { return vt1 == vt2; } -bool SuperWord::same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const { - return phase()->C->get_alias_index(mem_ref->adr_type()) == phase()->C->get_alias_index(best_align_to_mem_ref->adr_type()); +bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const { + return vloop().phase()->C->get_alias_index(m1->adr_type()) == + vloop().phase()->C->get_alias_index(m2->adr_type()); } //------------------------------in_packset--------------------------- diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 06ba6653d2766..db890e5539250 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -211,17 +211,12 @@ class SuperWord : public ResourceObj { GrowableArray &_bb_idx; // Map from Node _idx to index within block GrowableArray _block; // Nodes in current block - GrowableArray _mem_slice_head; // Memory slice head nodes - GrowableArray _mem_slice_tail; // Memory slice tail nodes GrowableArray _node_info; // Info needed per node CloneMap& _clone_map; // map of nodes created in cloning MemNode const* _align_to_ref; // Memory reference that pre-loop will align to DepGraph _dg; // Dependence graph - // Scratch pads - GrowableArray _nlist; // List of nodes - public: SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared); @@ -248,8 +243,14 @@ class SuperWord : public ResourceObj { bool is_marked_reduction(const Node* n) const { return 
vloop_analyzer().reductions().is_marked_reduction(n); } - bool reduction(Node* s1, Node* s2) const { - return vloop_analyzer().reductions().is_marked_reduction_pair(s1, s2); + + bool reduction(Node* n1, Node* n2) const { + return vloop_analyzer().reductions().is_marked_reduction_pair(n1, n2); + } + + // VLoopMemorySlices Accessors + bool same_memory_slice(MemNode* n1, MemNode* n2) const { + return vloop_analyzer().memory_slices().same_memory_slice(n1, n2); } #ifndef PRODUCT @@ -264,11 +265,6 @@ class SuperWord : public ResourceObj { return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT); } - bool is_trace_superword_memory_slices() const { - return TraceSuperWord || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES); - } - bool is_trace_superword_dependence_graph() const { return TraceSuperWord || vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH); @@ -304,7 +300,6 @@ class SuperWord : public ResourceObj { is_trace_align_vector() || vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES) || vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_MEMORY_SLICES) || vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) || vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) || vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) || @@ -370,7 +365,6 @@ class SuperWord : public ResourceObj { BasicType velt_basic_type(const Node* n) const { return velt_type(n)->array_element_basic_type(); } void set_velt_type(Node* n, const Type* t) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_velt_type = t; } bool same_velt_type(Node* n1, Node* n2); - bool same_memory_slice(MemNode* best_align_to_mem_ref, MemNode* mem_ref) const; // my_pack public: @@ -396,12 +390,6 @@ class SuperWord : public ResourceObj { // Construct dependency graph. 
void dependence_graph(); - // Analyze the memory slices - void find_memory_slices(); - NOT_PRODUCT( void print_memory_slices(); ) - // Return a memory slice (node list) in predecessor order starting at "start" - void mem_slice_preds(Node* start, Node* stop, GrowableArray &preds); - // Can s1 and s2 be in a pack with s1 immediately preceding s2 and s1 aligned at "align" bool stmts_can_pack(Node* s1, Node* s2, int align); // Does s exist in a pack at position pos? diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index 78f1301010aae..c7576e7343dfd 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -32,9 +32,9 @@ flags(POINTER_ANALYSIS, "Trace VPointer") \ flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \ flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \ + flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ flags(SW_TYPES, "Trace SuperWord::compute_vector_element_type") \ flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \ - flags(SW_MEMORY_SLICES, "Trace SuperWord memory slices") \ flags(SW_DEPENDENCE_GRAPH, "Trace SuperWord::dependence_graph") \ flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_refs") \ flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \ @@ -115,7 +115,6 @@ class TraceAutoVectorizationTagValidator { } else if (SW_VERBOSE == tag) { _tags.at_put(SW_TYPES, set_bit); _tags.at_put(SW_ALIGNMENT, set_bit); - _tags.at_put(SW_MEMORY_SLICES, set_bit); _tags.at_put(SW_DEPENDENCE_GRAPH, set_bit); _tags.at_put(SW_ADJACENT_MEMOPS, set_bit); _tags.at_put(SW_REJECTIONS, set_bit); @@ -123,7 +122,6 @@ class TraceAutoVectorizationTagValidator { _tags.at_put(SW_INFO, set_bit); _tags.at_put(SW_VERBOSE, set_bit); } else if (SW_INFO == tag) { - _tags.at_put(SW_MEMORY_SLICES, set_bit); _tags.at_put(SW_DEPENDENCE_GRAPH, set_bit); _tags.at_put(SW_ADJACENT_MEMOPS, 
set_bit); _tags.at_put(SW_REJECTIONS, set_bit); diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index a1369da7f4df3..56350b3a0a309 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -148,15 +148,15 @@ const char* VLoopAnalyzer::setup_submodules_helper() { _reductions.mark_reductions(); } - // TODO _memory_slices.analyze(); - - // // If there is no memory slice detected, that means there is no store. - // // If there is no reduction and no store, then we give up, because - // // vectorization is not possible anyway (given current limitations). - // if (!reductions().is_marked_reduction_loop() && - // _memory_slices.heads().is_empty()) { - // return VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE; - // } + _memory_slices.find_memory_slices(); + + // If there is no memory slice detected, that means there is no store. + // If there is no reduction and no store, then we give up, because + // vectorization is not possible anyway (given current limitations). 
+ if (!reductions().is_marked_reduction_loop() && + _memory_slices.heads().is_empty()) { + return VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE; + } // TODO return VLoopAnalyzer::SUCCESS; diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 30c386465fd9f..1ade6011b2e1d 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -116,6 +116,10 @@ class VLoop : public StackObj { return vtrace().is_trace(TraceAutoVectorizationTag::LOOP_ANALYZER); } + bool is_trace_memory_slices() const { + return vtrace().is_trace(TraceAutoVectorizationTag::MEMORY_SLICES); + } + bool is_trace_pointer_analysis() const { return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS); } @@ -189,7 +193,6 @@ class VLoopReductions : public StackObj { private: const VLoop& vloop() const { return _vloop; } - // Search for a path P = (n_1, n_2, ..., n_k) such that: // - original_input(n_i, input) = n_i+1 for all 1 <= i < k, // - path(n) for all n in P, @@ -236,6 +239,7 @@ class VLoopReductions : public StackObj { bool is_marked_reduction_loop() const { return !_loop_reductions.is_empty(); } // Are s1 and s2 reductions with a data path between them? bool is_marked_reduction_pair(Node* s1, Node* s2) const; + private: // Whether n is a standard reduction operator. static bool is_reduction_operator(const Node* n); @@ -247,6 +251,39 @@ class VLoopReductions : public StackObj { static Node* original_input(const Node* n, uint i); }; +// Submodule of VLoopAnalyzer. +// Find the memory slices in the loop. 
+class VLoopMemorySlices : public StackObj { +private: + const VLoop& _vloop; + + GrowableArray _heads; + GrowableArray _tails; + + const VLoop& vloop() const { return _vloop; } + +public: + VLoopMemorySlices(Arena* arena, const VLoop& vloop) : + _vloop(vloop), + _heads(arena, 8, 0, nullptr), + _tails(arena, 8, 0, nullptr) {}; + NONCOPYABLE(VLoopMemorySlices); + + void find_memory_slices(); + + const GrowableArray &heads() const { return _heads; } + const GrowableArray &tails() const { return _tails; } + + // Get all memory nodes of a slice, in reverse order + void get_slice(PhiNode* head, MemNode* tail, GrowableArray &slice) const; + + bool same_memory_slice(MemNode* m1, MemNode* m2) const; + +#ifndef PRODUCT + void print() const; +#endif +}; + // Analyze the loop in preparation for auto-vectorization. This class is // deliberately structured into many submodules, which are as independent // as possible, though some submodules do require other submodules. @@ -269,13 +306,21 @@ class VLoopAnalyzer : StackObj { // Submodules // TODO VLoopReductions _reductions; + VLoopMemorySlices _memory_slices; + //VLoopBody _body; + //VLoopTypes _types; + //VLoopDependenceGraph _dependence_graph; public: VLoopAnalyzer(const VLoop& vloop, VSharedData &vshared) : _vloop(vloop), _arena(mtCompiler), _success(false), - _reductions (&_arena, vloop) + _reductions (&_arena, vloop), + _memory_slices (&_arena, vloop) + //_body (&_arena, vloop), + //_types (&_arena, vloop, body()), + //_dependence_graph(&_arena, vloop, memory_slices(), body()) // TODO modules { _success = setup_submodules(); @@ -289,6 +334,10 @@ class VLoopAnalyzer : StackObj { // Read-only accessors for submodules const VLoop& vloop() const { return _vloop; } const VLoopReductions& reductions() const { return _reductions; } + const VLoopMemorySlices& memory_slices() const { return _memory_slices; } + //const VLoopBody& body() const { return _body; } + //const VLoopTypes& types() const { return _types; } + //const 
VLoopDependenceGraph& dependence_graph() const { return _dependence_graph; } // TODO private: From 8e4377409465c2b62c358071bf37390c787fc822 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 10 Feb 2024 22:03:16 +0100 Subject: [PATCH 04/13] VLoopBody --- src/hotspot/share/opto/loopopts.cpp | 2 +- src/hotspot/share/opto/superword.cpp | 144 ++++++++---------- src/hotspot/share/opto/superword.hpp | 24 ++- .../share/opto/traceAutoVectorizationTag.hpp | 1 + src/hotspot/share/opto/vectorization.cpp | 5 + src/hotspot/share/opto/vectorization.hpp | 54 ++++++- 6 files changed, 131 insertions(+), 99 deletions(-) diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index 406158eaee42f..ec16053e6bd45 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4237,7 +4237,7 @@ PhaseIdealLoop::auto_vectorize(IdealLoopTree* lpt, VSharedData &vshared) { return AutoVectorizeStatus::TriedAndFailed; } - SuperWord sw(vloop_analyzer, vshared); + SuperWord sw(vloop_analyzer); if (!sw.transform_loop()) { return AutoVectorizeStatus::TriedAndFailed; } diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index a0d56f08a8124..15d13358c9be7 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -38,12 +38,10 @@ #include "opto/movenode.hpp" #include "utilities/powerOfTwo.hpp" -SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared) : +SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : _vloop_analyzer(vloop_analyzer), _arena(mtCompiler), _packset(arena(), 8, 0, nullptr), // packs for the current block - _bb_idx(vshared.node_idx_to_loop_body_idx()), // node idx to index in bb - _block(arena(), vloop().estimated_body_length(), 0, nullptr), // nodes in current block _node_info(arena(), vloop().estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node _clone_map(phase()->C->clone_map()), // map of 
nodes created in cloning _align_to_ref(nullptr), // memory reference to align vectors to @@ -453,17 +451,6 @@ bool SuperWord::SLP_extract() { // TODO remove all the VLoopAnalyzer stuff - // Ready the block - if (!construct_bb()) { -#ifndef PRODUCT - if (is_trace_superword_any()) { - tty->print_cr("\nSuperWord::construct_bb failed: abort SuperWord"); - tty->cr(); - } -#endif - return false; - } - // Ensure extra info is allocated. initialize_node_info(); @@ -514,8 +501,8 @@ bool SuperWord::SLP_extract() { void SuperWord::find_adjacent_refs() { // Get list of memory operations Node_List memops; - for (int i = 0; i < _block.length(); i++) { - Node* n = _block.at(i); + for (int i = 0; i < body().length(); i++) { + Node* n = body().at(i); if (n->is_Mem() && !n->is_LoadStore() && in_bb(n) && is_java_primitive(n->as_Mem()->memory_type())) { int align = memory_alignment(n->as_Mem(), 0); @@ -772,8 +759,8 @@ void SuperWord::dependence_graph() { assert(cl->is_main_loop(), "SLP should only work on main loops"); // First, assign a dependence node to each memory node - for (int i = 0; i < _block.length(); i++ ) { - Node *n = _block.at(i); + for (int i = 0; i < body().length(); i++ ) { + Node *n = body().at(i); if (n->is_Mem() || n->is_memory_phi()) { _dg.make_node(n); } @@ -1961,8 +1948,8 @@ void SuperWord::verify_packs() { } // Check that no other node has my_pack set. - for (int i = 0; i < _block.length(); i++) { - Node* n = _block.at(i); + for (int i = 0; i < body().length(); i++) { + Node* n = body().at(i); if (!processed.member(n)) { assert(my_pack(n) == nullptr, "should not have pack if not in packset"); } @@ -2045,9 +2032,9 @@ class PacksetGraph { // Create nodes (from packs and scalar-nodes), and add edges, based on DepPreds. 
void build() { - const GrowableArray &packset = _slp->packset(); - const GrowableArray &block = _slp->block(); - const DepGraph &dg = _slp->dg(); + const GrowableArray& packset = _slp->packset(); + const GrowableArray& body = _slp->body(); + const DepGraph& dg = _slp->dg(); // Map nodes in packsets for (int i = 0; i < packset.length(); i++) { Node_List* p = packset.at(i); @@ -2062,8 +2049,8 @@ class PacksetGraph { int max_pid_packset = _max_pid; // Map nodes not in packset - for (int i = 0; i < block.length(); i++) { - Node* n = block.at(i); + for (int i = 0; i < body.length(); i++) { + Node* n = body.at(i); if (n->is_Phi() || n->is_CFG()) { continue; // ignore control flow } @@ -2090,7 +2077,7 @@ class PacksetGraph { if (pred_pid == pid && _slp->is_marked_reduction(n)) { continue; // reduction -> self-cycle is not a cyclic dependency } - // Only add edges once, and only for mapped nodes (in block) + // Only add edges once, and only for mapped nodes (in body) if (pred_pid > 0 && !set.test_set(pred_pid)) { incnt_set(pid, incnt(pid) + 1); // increment out(pred_pid).push(pid); @@ -2100,8 +2087,8 @@ class PacksetGraph { } // Map edges for nodes not in packset - for (int i = 0; i < block.length(); i++) { - Node* n = block.at(i); + for (int i = 0; i < body.length(); i++) { + Node* n = body.at(i); int pid = get_pid_or_zero(n); // zero for Phi or CFG if (pid <= max_pid_packset) { continue; // Only scalar-nodes @@ -2109,7 +2096,7 @@ class PacksetGraph { for (DepPreds preds(n, dg); !preds.done(); preds.next()) { Node* pred = preds.current(); int pred_pid = get_pid_or_zero(pred); - // Only add edges for mapped nodes (in block) + // Only add edges for mapped nodes (in body) if (pred_pid > 0) { incnt_set(pid, incnt(pid) + 1); // increment out(pred_pid).push(pid); @@ -2170,7 +2157,7 @@ class PacksetGraph { // print_nodes = true: print all C2 nodes beloning to PacksetGrahp node. // print_zero_incnt = false: do not print nodes that have no in-edges (any more). 
void print(bool print_nodes, bool print_zero_incnt) { - const GrowableArray &block = _slp->block(); + const GrowableArray &body = _slp->body(); tty->print_cr("PacksetGraph"); for (int pid = 1; pid <= _max_pid; pid++) { if (incnt(pid) == 0 && !print_zero_incnt) { @@ -2183,8 +2170,8 @@ class PacksetGraph { tty->print_cr("]"); #ifndef PRODUCT if (print_nodes) { - for (int i = 0; i < block.length(); i++) { - Node* n = block.at(i); + for (int i = 0; i < body.length(); i++) { + Node* n = body.at(i); if (get_pid_or_zero(n) == pid) { tty->print(" "); n->dump(); @@ -2364,8 +2351,8 @@ bool SuperWord::output() { uint max_vlen_in_bytes = 0; uint max_vlen = 0; - for (int i = 0; i < _block.length(); i++) { - Node* n = _block.at(i); + for (int i = 0; i < body().length(); i++) { + Node* n = body().at(i); Node_List* p = my_pack(n); if (p != nullptr && n == p->at(p->size()-1)) { // After schedule_reorder_memops, we know that the memops have the same order in the pack @@ -2637,7 +2624,6 @@ bool SuperWord::output() { } #endif - _block.at_put(i, vn); igvn().register_new_node_with_optimizer(vn); phase()->set_ctrl(vn, phase()->get_ctrl(first)); for (uint j = 0; j < p->size(); j++) { @@ -2654,7 +2640,7 @@ bool SuperWord::output() { } VectorNode::trace_new_vector(vn, "SuperWord"); } - }//for (int i = 0; i < _block.length(); i++) + }//for (int i = 0; i < body().length(); i++) if (max_vlen_in_bytes > C->max_vector_size()) { C->set_max_vector_size(max_vlen_in_bytes); @@ -2918,33 +2904,32 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) { return true; } -//------------------------------construct_bb--------------------------- -// Construct reverse postorder list of block members -bool SuperWord::construct_bb() { - assert(_block.length() == 0, "block is empty"); +// Return nullptr if success, else failure message +const char* VLoopBody::construct() { + assert(_body.length() == 0, "body is empty"); // First pass over loop body: // (1) Check that there are no unwanted nodes (LoadStore, 
MergeMem, data Proj). // (2) Count number of nodes, and create a temporary map (_idx -> bb_idx). // (3) Verify that all non-ctrl nodes have an input inside the loop. - int block_count = 0; - for (uint i = 0; i < lpt()->_body.size(); i++) { - Node* n = lpt()->_body.at(i); + int body_count = 0; + for (uint i = 0; i < vloop().lpt()->_body.size(); i++) { + Node* n = vloop().lpt()->_body.at(i); set_bb_idx(n, i); // Create a temporary map - if (in_bb(n)) { - block_count++; + if (vloop().in_bb(n)) { + body_count++; if (n->is_LoadStore() || n->is_MergeMem() || (n->is_Proj() && !n->as_Proj()->is_CFG())) { // Bailout if the loop has LoadStore, MergeMem or data Proj // nodes. Superword optimization does not work with them. #ifndef PRODUCT - if (is_trace_superword_any()) { - tty->print_cr("SuperWord::construct_bb: fails because of unhandled node:"); + if (vloop().is_trace_body()) { + tty->print_cr("VLoopBody::construct: fails because of unhandled node:"); n->dump(); } #endif - return false; + return VLoopBody::FAILURE_NODE_NOT_ALLOWED; } #ifdef ASSERT @@ -2952,7 +2937,7 @@ bool SuperWord::construct_bb() { bool found = false; for (uint j = 0; j < n->req(); j++) { Node* def = n->in(j); - if (def != nullptr && in_bb(def)) { + if (def != nullptr && vloop().in_bb(def)) { found = true; break; } @@ -2963,17 +2948,17 @@ bool SuperWord::construct_bb() { } } - // Create a reverse-post-order list of nodes in block + // Create a reverse-post-order list of nodes in body ResourceMark rm; GrowableArray stack; VectorSet visited; VectorSet post_visited; - visited.set(bb_idx(cl())); - stack.push(cl()); + visited.set(bb_idx(vloop().cl())); + stack.push(vloop().cl()); // Do a depth first walk over out edges - int rpo_idx = block_count - 1; + int rpo_idx = body_count - 1; while (!stack.is_empty()) { Node* n = stack.top(); // Leave node on stack if (!visited.test_set(bb_idx(n))) { @@ -2983,9 +2968,9 @@ bool SuperWord::construct_bb() { const int old_length = stack.length(); for (DUIterator_Fast 
imax, i = n->fast_outs(imax); i < imax; i++) { Node* use = n->fast_out(i); - if (in_bb(use) && !visited.test(bb_idx(use)) && + if (vloop().in_bb(use) && !visited.test(bb_idx(use)) && // Don't go around backedge - (!use->is_Phi() || n == cl())) { + (!use->is_Phi() || n == vloop().cl())) { stack.push(use); } } @@ -2993,7 +2978,7 @@ bool SuperWord::construct_bb() { // There were no additional uses, post visit node now stack.pop(); // Remove node from stack assert(rpo_idx >= 0, "must still have idx to pass out"); - _block.at_put_grow(rpo_idx, n); + _body.at_put_grow(rpo_idx, n); rpo_idx--; post_visited.set(bb_idx(n)); assert(rpo_idx >= 0 || stack.is_empty(), "still have idx left or are finished"); @@ -3003,25 +2988,25 @@ bool SuperWord::construct_bb() { } } - // Create real map of block indices for nodes - for (int j = 0; j < _block.length(); j++) { - Node* n = _block.at(j); + // Create real map of body indices for nodes + for (int j = 0; j < _body.length(); j++) { + Node* n = _body.at(j); set_bb_idx(n, j); } #ifndef PRODUCT - if (is_trace_superword_info()) { - print_bb(); + if (vloop().is_trace_body()) { + print(); } #endif - assert(rpo_idx == -1 && block_count == _block.length(), "all block members found"); - return true; + assert(rpo_idx == -1 && body_count == _body.length(), "all body members found"); + return nullptr; // success } // Initialize per node info void SuperWord::initialize_node_info() { - Node* last = _block.at(_block.length() - 1); + Node* last = body().at(body().length() - 1); grow_node_info(bb_idx(last)); } @@ -3033,8 +3018,8 @@ void SuperWord::compute_max_depth() { bool again; do { again = false; - for (int i = 0; i < _block.length(); i++) { - Node* n = _block.at(i); + for (int i = 0; i < body().length(); i++) { + Node* n = body().at(i); if (!n->is_Phi()) { int d_orig = depth(n); int d_in = 0; @@ -3124,15 +3109,15 @@ void SuperWord::compute_vector_element_type() { #endif // Initial type - for (int i = 0; i < _block.length(); i++) { - Node* n = 
_block.at(i); + for (int i = 0; i < body().length(); i++) { + Node* n = body().at(i); set_velt_type(n, container_type(n)); } // Propagate integer narrowed type backwards through operations // that don't depend on higher order bits - for (int i = _block.length() - 1; i >= 0; i--) { - Node* n = _block.at(i); + for (int i = body().length() - 1; i >= 0; i--) { + Node* n = body().at(i); // Only integer types need be examined const Type* vtn = velt_type(n); if (vtn->basic_type() == T_INT) { @@ -3180,8 +3165,8 @@ void SuperWord::compute_vector_element_type() { } } } - for (int i = 0; i < _block.length(); i++) { - Node* n = _block.at(i); + for (int i = 0; i < body().length(); i++) { + Node* n = body().at(i); Node* nn = n; if (nn->is_Bool() && nn->in(0) == nullptr) { nn = nn->in(1); @@ -3198,8 +3183,8 @@ void SuperWord::compute_vector_element_type() { } #ifndef PRODUCT if (is_trace_superword_vector_element_type()) { - for (int i = 0; i < _block.length(); i++) { - Node* n = _block.at(i); + for (int i = 0; i < body().length(); i++) { + Node* n = body().at(i); velt_type(n)->dump(); tty->print("\t"); n->dump(); @@ -3699,19 +3684,18 @@ void SuperWord::print_pack(Node_List* p) { } } -//------------------------------print_bb--------------------------- -void SuperWord::print_bb() { #ifndef PRODUCT +void VLoopBody::print() const { tty->print_cr("\nBlock"); - for (int i = 0; i < _block.length(); i++) { - Node* n = _block.at(i); + for (int i = 0; i < body().length(); i++) { + Node* n = body().at(i); tty->print("%d ", i); - if (n) { + if (n != nullptr) { n->dump(); } } -#endif } +#endif //------------------------------print_stmt--------------------------- void SuperWord::print_stmt(Node* s) { diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index db890e5539250..fa8563112bba5 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -208,9 +208,6 @@ class SuperWord : public ResourceObj { GrowableArray 
_packset; // Packs for the current block - GrowableArray &_bb_idx; // Map from Node _idx to index within block - - GrowableArray _block; // Nodes in current block GrowableArray _node_info; // Info needed per node CloneMap& _clone_map; // map of nodes created in cloning MemNode const* _align_to_ref; // Memory reference that pre-loop will align to @@ -218,7 +215,7 @@ class SuperWord : public ResourceObj { DepGraph _dg; // Dependence graph public: - SuperWord(const VLoopAnalyzer &vloop_analyzer, VSharedData &vshared); + SuperWord(const VLoopAnalyzer &vloop_analyzer); // Attempt to run the SuperWord algorithm on the loop. Return true if we succeed. bool transform_loop(); @@ -253,6 +250,15 @@ class SuperWord : public ResourceObj { return vloop_analyzer().memory_slices().same_memory_slice(n1, n2); } + // VLoopAnalyzer body + const GrowableArray& body() const { + return vloop_analyzer().body().body(); + } + + int bb_idx(const Node* n) const { + return vloop_analyzer().body().bb_idx(n); + } + #ifndef PRODUCT // TraceAutoVectorization and TraceSuperWord bool is_trace_superword_vector_element_type() const { @@ -317,7 +323,6 @@ class SuperWord : public ResourceObj { bool do_vector_loop() { return _do_vector_loop; } const GrowableArray& packset() const { return _packset; } - const GrowableArray& block() const { return _block; } const DepGraph& dg() const { return _dg; } private: bool _race_possible; // In cases where SDMU is true @@ -340,12 +345,6 @@ class SuperWord : public ResourceObj { const MemNode* align_to_ref() const { return _align_to_ref; } void set_align_to_ref(const MemNode* m) { _align_to_ref = m; } - // block accessors - public: - int bb_idx(const Node* n) const { assert(in_bb(n), "must be"); return _bb_idx.at(n->_idx); } - private: - void set_bb_idx(Node* n, int i) { _bb_idx.at_put_grow(n->_idx, i); } - // Ensure node_info contains element "i" void grow_node_info(int i) { if (i >= _node_info.length()) _node_info.at_put_grow(i, SWNodeInfo::initial); } @@ -453,8 
+452,6 @@ class SuperWord : public ResourceObj { DEBUG_ONLY(void verify_no_extract();) // Is use->in(u_idx) a vector use? bool is_vector_use(Node* use, int u_idx); - // Construct reverse postorder list of block members - bool construct_bb(); // Initialize per node info void initialize_node_info(); // Compute max depth for expressions from beginning of block @@ -482,7 +479,6 @@ class SuperWord : public ResourceObj { // print methods void print_packset(); void print_pack(Node_List* p); - void print_bb(); void print_stmt(Node* s); void packset_sort(int n); diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index c7576e7343dfd..5121634285b90 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -33,6 +33,7 @@ flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \ flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \ flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ + flags(BODY, "Trace VLoopBody") \ flags(SW_TYPES, "Trace SuperWord::compute_vector_element_type") \ flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \ flags(SW_DEPENDENCE_GRAPH, "Trace SuperWord::dependence_graph") \ diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 56350b3a0a309..d90dea194ecf8 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -158,6 +158,11 @@ const char* VLoopAnalyzer::setup_submodules_helper() { return VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE; } + const char* body_failure = _body.construct(); + if (body_failure != nullptr) { + return body_failure; + } + // TODO return VLoopAnalyzer::SUCCESS; } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 1ade6011b2e1d..0ae2bb7a12da9 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ 
b/src/hotspot/share/opto/vectorization.hpp @@ -120,6 +120,10 @@ class VLoop : public StackObj { return vtrace().is_trace(TraceAutoVectorizationTag::MEMORY_SLICES); } + bool is_trace_body() const { + return vtrace().is_trace(TraceAutoVectorizationTag::BODY); + } + bool is_trace_pointer_analysis() const { return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS); } @@ -284,6 +288,48 @@ class VLoopMemorySlices : public StackObj { #endif }; +// Submodule of VLoopAnalyzer. +// Finds all nodes in the body, and creates a mapping node->_idx to a body_idx. +// This mapping is used so that the sizes of subsequent data structures only grow +// with the body size, and not with the number of all nodes in the compilation. +class VLoopBody : public StackObj { +private: + static constexpr char const* FAILURE_NODE_NOT_ALLOWED = "encontered unhandled node"; + + const VLoop& _vloop; + + // Mapping body_idx -> Node* + GrowableArray _body; + + // Mapping node->_idx -> body_idx + // Can be very large, and thus lives in VSharedData + GrowableArray& _body_idx; + + const VLoop& vloop() const { return _vloop; } + +public: + VLoopBody(Arena* arena, const VLoop& vloop, VSharedData& vshared) : + _vloop(vloop), + _body(arena, vloop.estimated_body_length(), 0, nullptr), + _body_idx(vshared.node_idx_to_loop_body_idx()) {} + + NONCOPYABLE(VLoopBody); + + const char* construct(); + const GrowableArray& body() const { return _body; } + NOT_PRODUCT( void print() const; ) + + int bb_idx(const Node* n) const { + assert(_vloop.in_bb(n), "must be in basic block"); + return _body_idx.at(n->_idx); + } + +private: + void set_bb_idx(Node* n, int i) { + _body_idx.at_put_grow(n->_idx, i); + } +}; + // Analyze the loop in preparation for auto-vectorization. This class is // deliberately structured into many submodules, which are as independent // as possible, though some submodules do require other submodules.
@@ -307,7 +353,7 @@ class VLoopAnalyzer : StackObj { // TODO VLoopReductions _reductions; VLoopMemorySlices _memory_slices; - //VLoopBody _body; + VLoopBody _body; //VLoopTypes _types; //VLoopDependenceGraph _dependence_graph; @@ -317,8 +363,8 @@ class VLoopAnalyzer : StackObj { _arena(mtCompiler), _success(false), _reductions (&_arena, vloop), - _memory_slices (&_arena, vloop) - //_body (&_arena, vloop), + _memory_slices (&_arena, vloop), + _body (&_arena, vloop, vshared) //_types (&_arena, vloop, body()), //_dependence_graph(&_arena, vloop, memory_slices(), body()) // TODO modules @@ -335,7 +381,7 @@ class VLoopAnalyzer : StackObj { const VLoop& vloop() const { return _vloop; } const VLoopReductions& reductions() const { return _reductions; } const VLoopMemorySlices& memory_slices() const { return _memory_slices; } - //const VLoopBody& body() const { return _body; } + const VLoopBody& body() const { return _body; } //const VLoopTypes& types() const { return _types; } //const VLoopDependenceGraph& dependence_graph() const { return _dependence_graph; } // TODO From 3cf41a5a8d4487dc1d90f9e2fc8a127f3c3cce96 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 10 Feb 2024 22:50:04 +0100 Subject: [PATCH 05/13] VLoopTypes --- src/hotspot/share/opto/superword.cpp | 77 +++++-------- src/hotspot/share/opto/superword.hpp | 55 +++++---- .../share/opto/traceAutoVectorizationTag.hpp | 3 +- src/hotspot/share/opto/vectorization.cpp | 2 + src/hotspot/share/opto/vectorization.hpp | 106 ++++++++++++++++-- 5 files changed, 158 insertions(+), 85 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 15d13358c9be7..39344aee3a7b4 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -460,9 +460,6 @@ bool SuperWord::SLP_extract() { // compute function depth(Node*) compute_max_depth(); - // Compute vector element types - compute_vector_element_type(); - // Attempt vectorization 
find_adjacent_refs(); @@ -1120,13 +1117,6 @@ void SuperWord::set_alignment(Node* s1, Node* s2, int align) { } } -//------------------------------data_size--------------------------- -int SuperWord::data_size(Node* s) { - int bsize = type2aelembytes(velt_basic_type(s)); - assert(bsize != 0, "valid size"); - return bsize; -} - //------------------------------extend_packlist--------------------------- // Extend packset by following use->def and def->use links from pack members. void SuperWord::extend_packlist() { @@ -3094,30 +3084,29 @@ int SuperWord::max_vector_size_in_def_use_chain(Node* n) { return max < 2 ? Matcher::max_vector_size_auto_vectorization(bt) : max; } -//-------------------------compute_vector_element_type----------------------- -// Compute necessary vector element type for expressions -// This propagates backwards a narrower integer type when the -// upper bits of the value are not needed. -// Example: char a,b,c; a = b + c; -// Normally the type of the add is integer, but for packed character -// operations the type of the add needs to be char. 
-void SuperWord::compute_vector_element_type() { +void VLoopTypes::compute_vector_element_type() { #ifndef PRODUCT - if (is_trace_superword_vector_element_type()) { - tty->print_cr("\ncompute_velt_type:"); + if (vloop().is_trace_vector_element_type()) { + tty->print_cr("\nVLoopTypes::compute_vector_element_type:"); } #endif + const GrowableArray& body = _body.body(); + + assert(_velt_type.is_empty(), "must not yet be computed"); + // reserve space + _velt_type.at_put_grow(body.length()-1, nullptr); + // Initial type - for (int i = 0; i < body().length(); i++) { - Node* n = body().at(i); + for (int i = 0; i < body.length(); i++) { + Node* n = body.at(i); set_velt_type(n, container_type(n)); } // Propagate integer narrowed type backwards through operations // that don't depend on higher order bits - for (int i = body().length() - 1; i >= 0; i--) { - Node* n = body().at(i); + for (int i = body.length() - 1; i >= 0; i--) { + Node* n = body.at(i); // Only integer types need be examined const Type* vtn = velt_type(n); if (vtn->basic_type() == T_INT) { @@ -3127,12 +3116,14 @@ void SuperWord::compute_vector_element_type() { for (uint j = start; j < end; j++) { Node* in = n->in(j); // Don't propagate through a memory - if (!in->is_Mem() && in_bb(in) && velt_type(in)->basic_type() == T_INT && + if (!in->is_Mem() && + vloop().in_bb(in) && + velt_type(in)->basic_type() == T_INT && data_size(n) < data_size(in)) { bool same_type = true; for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) { Node *use = in->fast_out(k); - if (!in_bb(use) || !same_velt_type(use, n)) { + if (!vloop().in_bb(use) || !same_velt_type(use, n)) { same_type = false; break; } @@ -3149,7 +3140,9 @@ void SuperWord::compute_vector_element_type() { int op = in->Opcode(); if (VectorNode::is_shift_opcode(op) || op == Op_AbsI || op == Op_ReverseBytesI) { Node* load = in->in(1); - if (load->is_Load() && in_bb(load) && (velt_type(load)->basic_type() == T_INT)) { + if (load->is_Load() && + 
vloop().in_bb(load) && + (velt_type(load)->basic_type() == T_INT)) { // Only Load nodes distinguish signed (LoadS/LoadB) and unsigned // (LoadUS/LoadUB) values. Store nodes only have one version. vt = velt_type(load); @@ -3165,16 +3158,17 @@ void SuperWord::compute_vector_element_type() { } } } - for (int i = 0; i < body().length(); i++) { - Node* n = body().at(i); + for (int i = 0; i < body.length(); i++) { + Node* n = body.at(i); Node* nn = n; if (nn->is_Bool() && nn->in(0) == nullptr) { nn = nn->in(1); assert(nn->is_Cmp(), "always have Cmp above Bool"); } if (nn->is_Cmp() && nn->in(0) == nullptr) { - assert(in_bb(nn->in(1)) || in_bb(nn->in(2)), "one of the inputs must be in the loop too"); - if (in_bb(nn->in(1))) { + assert(vloop().in_bb(nn->in(1)) || vloop().in_bb(nn->in(2)), + "one of the inputs must be in the loop too"); + if (vloop().in_bb(nn->in(1))) { set_velt_type(n, velt_type(nn->in(1))); } else { set_velt_type(n, velt_type(nn->in(2))); @@ -3182,9 +3176,9 @@ void SuperWord::compute_vector_element_type() { } } #ifndef PRODUCT - if (is_trace_superword_vector_element_type()) { - for (int i = 0; i < body().length(); i++) { - Node* n = body().at(i); + if (vloop().is_trace_vector_element_type()) { + for (int i = 0; i < body.length(); i++) { + Node* n = body.at(i); velt_type(n)->dump(); tty->print("\t"); n->dump(); @@ -3223,9 +3217,8 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) { return off_mod; } -//---------------------------container_type--------------------------- // Smallest type containing range of values -const Type* SuperWord::container_type(Node* n) { +const Type* VLoopTypes::container_type(Node* n) const { if (n->is_Mem()) { BasicType bt = n->as_Mem()->memory_type(); if (n->is_Store() && (bt == T_CHAR)) { @@ -3242,7 +3235,7 @@ const Type* SuperWord::container_type(Node* n) { } return Type::get_const_basic_type(bt); } - const Type* t = igvn().type(n); + const Type* t = vloop().phase()->igvn().type(n); if (t->basic_type() == T_INT) { // 
A narrow type of arithmetic operations will be determined by // propagating the type of memory operations. @@ -3251,16 +3244,6 @@ const Type* SuperWord::container_type(Node* n) { return t; } -bool SuperWord::same_velt_type(Node* n1, Node* n2) { - const Type* vt1 = velt_type(n1); - const Type* vt2 = velt_type(n2); - if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) { - // Compare vectors element sizes for integer types. - return data_size(n1) == data_size(n2); - } - return vt1 == vt2; -} - bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const { return vloop().phase()->C->get_alias_index(m1->adr_type()) == vloop().phase()->C->get_alias_index(m2->adr_type()); diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index fa8563112bba5..efa101c4d060c 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -187,10 +187,9 @@ class SWNodeInfo { public: int _alignment; // memory alignment for a node int _depth; // Max expression (DAG) depth from block start - const Type* _velt_type; // vector element type Node_List* _my_pack; // pack containing this node - SWNodeInfo() : _alignment(-1), _depth(0), _velt_type(nullptr), _my_pack(nullptr) {} + SWNodeInfo() : _alignment(-1), _depth(0), _my_pack(nullptr) {} static const SWNodeInfo initial; }; @@ -250,7 +249,7 @@ class SuperWord : public ResourceObj { return vloop_analyzer().memory_slices().same_memory_slice(n1, n2); } - // VLoopAnalyzer body + // VLoopBody Accessors const GrowableArray& body() const { return vloop_analyzer().body().body(); } @@ -259,13 +258,33 @@ class SuperWord : public ResourceObj { return vloop_analyzer().body().bb_idx(n); } -#ifndef PRODUCT - // TraceAutoVectorization and TraceSuperWord - bool is_trace_superword_vector_element_type() const { - // Too verbose for TraceSuperWord - return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES); + // VLoopTypes Accessors + const Type* velt_type(Node* 
n) const { + return vloop_analyzer().types().velt_type(n); + } + + BasicType velt_basic_type(Node* n) const { + return vloop_analyzer().types().velt_basic_type(n); + } + + bool same_velt_type(Node* n1, Node* n2) const { + return vloop_analyzer().types().same_velt_type(n1, n2); + } + + int data_size(Node* n) const { + return vloop_analyzer().types().data_size(n); + } + + int vector_width(Node* n) const { + return vloop_analyzer().types().vector_width(n); } + int vector_width_in_bytes(const Node* n) const { + return vloop_analyzer().types().vector_width_in_bytes(n); + } + +#ifndef PRODUCT + // TraceAutoVectorization and TraceSuperWord bool is_trace_superword_alignment() const { // Too verbose for TraceSuperWord return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT); @@ -304,7 +323,6 @@ class SuperWord : public ResourceObj { bool is_trace_superword_any() const { return TraceSuperWord || is_trace_align_vector() || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_TYPES) || vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) || vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) || vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) || @@ -333,14 +351,6 @@ class SuperWord : public ResourceObj { // Accessors Arena* arena() { return &_arena; } - int vector_width(const Node* n) const { - BasicType bt = velt_basic_type(n); - return MIN2(ABS(iv_stride()), Matcher::max_vector_size(bt)); - } - int vector_width_in_bytes(const Node* n) const { - BasicType bt = velt_basic_type(n); - return vector_width(n)*type2aelembytes(bt); - } int get_vw_bytes_special(MemNode* s); const MemNode* align_to_ref() const { return _align_to_ref; } void set_align_to_ref(const MemNode* m) { _align_to_ref = m; } @@ -359,12 +369,6 @@ class SuperWord : public ResourceObj { int depth(Node* n) const { return _node_info.adr_at(bb_idx(n))->_depth; } void set_depth(Node* n, int d) { int i = bb_idx(n); grow_node_info(i); 
_node_info.adr_at(i)->_depth = d; } - // vector element type - const Type* velt_type(const Node* n) const { return _node_info.adr_at(bb_idx(n))->_velt_type; } - BasicType velt_basic_type(const Node* n) const { return velt_type(n)->array_element_basic_type(); } - void set_velt_type(Node* n, const Type* t) { int i = bb_idx(n); grow_node_info(i); _node_info.adr_at(i)->_velt_type = t; } - bool same_velt_type(Node* n1, Node* n2); - // my_pack public: Node_List* my_pack(Node* n) { return !in_bb(n) ? nullptr : _node_info.adr_at(bb_idx(n))->_my_pack; } @@ -405,7 +409,6 @@ class SuperWord : public ResourceObj { // do s1 and s2 have similar input edges? bool have_similar_inputs(Node* s1, Node* s2); void set_alignment(Node* s1, Node* s2, int align); - int data_size(Node* s); // Extend packset by following use->def and def->use links from pack members. void extend_packlist(); int adjust_alignment_for_type_conversion(Node* s, Node* t, int align); @@ -460,8 +463,6 @@ class SuperWord : public ResourceObj { BasicType longer_type_for_conversion(Node* n); // Find the longest type in def-use chain for packed nodes, and then compute the max vector size. int max_vector_size_in_def_use_chain(Node* n); - // Compute necessary vector element type for expressions - void compute_vector_element_type(); // Are s1 and s2 in a pack pair and ordered as s1,s2? bool in_packset(Node* s1, Node* s2); // Remove the pack at position pos in the packset @@ -469,8 +470,6 @@ class SuperWord : public ResourceObj { static LoadNode::ControlDependency control_dependency(Node_List* p); // Alignment within a vector memory reference int memory_alignment(MemNode* s, int iv_adjust); - // Smallest type containing range of values - const Type* container_type(Node* n); // Ensure that the main loop vectors are aligned by adjusting the pre loop limit. void adjust_pre_loop_limit_to_align_main_loop_vectors(); // Is the use of d1 in u1 at the same operand position as d2 in u2? 
diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index 5121634285b90..615f9230f3ae4 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -34,7 +34,7 @@ flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \ flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ flags(BODY, "Trace VLoopBody") \ - flags(SW_TYPES, "Trace SuperWord::compute_vector_element_type") \ + flags(TYPES, "Trace VLoopTypes") \ flags(SW_ALIGNMENT, "Trace SuperWord alignment analysis") \ flags(SW_DEPENDENCE_GRAPH, "Trace SuperWord::dependence_graph") \ flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_refs") \ @@ -114,7 +114,6 @@ class TraceAutoVectorizationTagValidator { } else if (ALL == tag) { _tags.set_range(0, TRACE_AUTO_VECTORIZATION_TAG_NUM); } else if (SW_VERBOSE == tag) { - _tags.at_put(SW_TYPES, set_bit); _tags.at_put(SW_ALIGNMENT, set_bit); _tags.at_put(SW_DEPENDENCE_GRAPH, set_bit); _tags.at_put(SW_ADJACENT_MEMOPS, set_bit); diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index d90dea194ecf8..0fbb337f26323 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -163,6 +163,8 @@ const char* VLoopAnalyzer::setup_submodules_helper() { return body_failure; } + _types.compute_vector_element_type(); + // TODO return VLoopAnalyzer::SUCCESS; } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 0ae2bb7a12da9..39ad48ce0f4f2 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -124,6 +124,10 @@ class VLoop : public StackObj { return vtrace().is_trace(TraceAutoVectorizationTag::BODY); } + bool is_trace_vector_element_type() const { + return vtrace().is_trace(TraceAutoVectorizationTag::TYPES); + } + bool is_trace_pointer_analysis() const { return 
vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS); } @@ -330,6 +334,92 @@ class VLoopBody : public StackObj { } }; +// Submodule of VLoopAnalyzer. +// Compute the vector element type for every node in the loop body. +// We need to do this to be able to vectorize the narrower integer +// types (byte, char, short). In the C2 IR, their operations are +// done with full int type with 4 byte precision (e.g. AddI, MulI). +// Example: char a,b,c; a = (char)(b + c); +// However, if we can prove that the upper bits are only truncated, +// and the lower bits for the narrower type are computed correctly, we +// can compute the operations in the narrower type directly (e.g. we +// perform the AddI or MulI with 1 or 2 bytes). This allows us to +// fit more operations in a vector, and can remove the otherwise +// required conversion (int <-> narrower type). +// We compute the types backwards (use-to-def): If all use nodes +// only require the lower bits, then the def node can do the operation +// with only the lower bits, and we propagate the narrower type to it.
+class VLoopTypes : public StackObj { +private: + const VLoop& _vloop; + const VLoopBody& _body; + + // bb_idx -> vector element type + GrowableArray _velt_type; + + const VLoop& vloop() const { return _vloop; } + const VLoopBody& body() const { return _body; } + +public: + VLoopTypes(Arena* arena, + const VLoop& vloop, + const VLoopBody& body) : + _vloop(vloop), + _body(body), + _velt_type(arena, vloop.estimated_body_length(), 0, nullptr) {} + NONCOPYABLE(VLoopTypes); + + void compute_vector_element_type(); + NOT_PRODUCT( void print() const; ) + + const Type* velt_type(const Node* n) const { + assert(vloop().in_bb(n), "only call on nodes in loop"); + const Type* t = _velt_type.at(body().bb_idx(n)); + assert(t != nullptr, "must have type"); + return t; + } + + BasicType velt_basic_type(const Node* n) const { + return velt_type(n)->array_element_basic_type(); + } + + int data_size(Node* s) const { + int bsize = type2aelembytes(velt_basic_type(s)); + assert(bsize != 0, "valid size"); + return bsize; + } + + bool same_velt_type(Node* n1, Node* n2) const { + const Type* vt1 = velt_type(n1); + const Type* vt2 = velt_type(n2); + if (vt1->basic_type() == T_INT && vt2->basic_type() == T_INT) { + // Compare vectors element sizes for integer types. + return data_size(n1) == data_size(n2); + } + return vt1 == vt2; + } + + int vector_width(const Node* n) const { + BasicType bt = velt_basic_type(n); + return MIN2(ABS(_vloop.iv_stride()), Matcher::max_vector_size(bt)); + } + + int vector_width_in_bytes(const Node* n) const { + BasicType bt = velt_basic_type(n); + return vector_width(n) * type2aelembytes(bt); + } + +private: + void set_velt_type(Node* n, const Type* t) { + assert(t != nullptr, "cannot set nullptr"); + assert(vloop().in_bb(n), "only call on nodes in loop"); + _velt_type.at_put(body().bb_idx(n), t); + } + + // Smallest type containing range of values + const Type* container_type(Node* n) const; +}; + // Analyze the loop in preparation for auto-vectorization. 
This class is // deliberately structured into many submodules, which are as independent // as possible, though some submodules do require other submodules. @@ -340,21 +430,21 @@ class VLoopAnalyzer : StackObj { static constexpr char const* FAILURE_NO_MAX_UNROLL = "slp max unroll analysis required"; static constexpr char const* FAILURE_NO_REDUCTION_OR_STORE = "no reduction and no store in loop"; - const VLoop& _vloop; + const VLoop& _vloop; // Arena for all submodules - Arena _arena; + Arena _arena; // If all submodules are setup successfully, we set this flag at the // end of the constructor - bool _success; + bool _success; // Submodules // TODO - VLoopReductions _reductions; + VLoopReductions _reductions; VLoopMemorySlices _memory_slices; VLoopBody _body; - //VLoopTypes _types; + VLoopTypes _types; //VLoopDependenceGraph _dependence_graph; public: @@ -364,8 +454,8 @@ class VLoopAnalyzer : StackObj { _success(false), _reductions (&_arena, vloop), _memory_slices (&_arena, vloop), - _body (&_arena, vloop, vshared) - //_types (&_arena, vloop, body()), + _body (&_arena, vloop, vshared), + _types (&_arena, vloop, body()) //_dependence_graph(&_arena, vloop, memory_slices(), body()) // TODO modules { @@ -382,7 +472,7 @@ class VLoopAnalyzer : StackObj { const VLoopReductions& reductions() const { return _reductions; } const VLoopMemorySlices& memory_slices() const { return _memory_slices; } const VLoopBody& body() const { return _body; } - //const VLoopTypes& types() const { return _types; } + const VLoopTypes& types() const { return _types; } //const VLoopDependenceGraph& dependence_graph() const { return _dependence_graph; } // TODO From a69bacf662d4bf305e7a10f9461f0c009978a3ea Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 10 Feb 2024 23:03:54 +0100 Subject: [PATCH 06/13] remove some comments --- src/hotspot/share/opto/superword.cpp | 3 --- src/hotspot/share/opto/vectorization.cpp | 1 - src/hotspot/share/opto/vectorization.hpp | 7 ------- 3 files changed, 11 
deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 39344aee3a7b4..52c63dcc68943 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -376,7 +376,6 @@ void VLoopReductions::mark_reductions() { } // Reduction cycle found. Mark all nodes in the found path as reductions. current = first; - // TODO trace this for (int i = 0; i < path_nodes; i++) { _loop_reductions.set(current->_idx); current = original_input(current, reduction_input); @@ -449,8 +448,6 @@ bool SuperWord::transform_loop() { bool SuperWord::SLP_extract() { assert(cl()->is_main_loop(), "SLP should only work on main loops"); - // TODO remove all the VLoopAnalyzer stuff - // Ensure extra info is allocated. initialize_node_info(); diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 0fbb337f26323..93bdecb02e257 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -165,7 +165,6 @@ const char* VLoopAnalyzer::setup_submodules_helper() { _types.compute_vector_element_type(); - // TODO return VLoopAnalyzer::SUCCESS; } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 39ad48ce0f4f2..1d32ff694a660 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -425,7 +425,6 @@ class VLoopTypes : public StackObj { // as possible, though some submodules do require other submodules. 
class VLoopAnalyzer : StackObj { private: - // TODO check if all are really needed static constexpr char const* SUCCESS = "success"; static constexpr char const* FAILURE_NO_MAX_UNROLL = "slp max unroll analysis required"; static constexpr char const* FAILURE_NO_REDUCTION_OR_STORE = "no reduction and no store in loop"; @@ -440,12 +439,10 @@ class VLoopAnalyzer : StackObj { bool _success; // Submodules - // TODO VLoopReductions _reductions; VLoopMemorySlices _memory_slices; VLoopBody _body; VLoopTypes _types; - //VLoopDependenceGraph _dependence_graph; public: VLoopAnalyzer(const VLoop& vloop, VSharedData &vshared) : @@ -456,8 +453,6 @@ class VLoopAnalyzer : StackObj { _memory_slices (&_arena, vloop), _body (&_arena, vloop, vshared), _types (&_arena, vloop, body()) - //_dependence_graph(&_arena, vloop, memory_slices(), body()) - // TODO modules { _success = setup_submodules(); } @@ -473,8 +468,6 @@ class VLoopAnalyzer : StackObj { const VLoopMemorySlices& memory_slices() const { return _memory_slices; } const VLoopBody& body() const { return _body; } const VLoopTypes& types() const { return _types; } - //const VLoopDependenceGraph& dependence_graph() const { return _dependence_graph; } - // TODO private: bool setup_submodules(); From b43d513b2f0fff1ffaad4bb3718798706eb46d0d Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sun, 11 Feb 2024 14:46:24 +0100 Subject: [PATCH 07/13] move _loop_or_ctrl from ResouceArena, bc ResourceMark in SuperWord::dependence_graph --- src/hotspot/share/opto/loopnode.hpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index 3b281e0f77de4..b1a0d95ddf266 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -1098,6 +1098,7 @@ class PhaseIdealLoop : public PhaseTransform { // Compute the Ideal Node to Loop mapping PhaseIdealLoop(PhaseIterGVN& igvn, LoopOptsMode mode) : PhaseTransform(Ideal_Loop), + 
_loop_or_ctrl(igvn.C->comp_arena()), _igvn(igvn), _verify_me(nullptr), _verify_only(false), @@ -1112,6 +1113,7 @@ class PhaseIdealLoop : public PhaseTransform { // or only verify that the graph is valid if verify_me is null. PhaseIdealLoop(PhaseIterGVN& igvn, const PhaseIdealLoop* verify_me = nullptr) : PhaseTransform(Ideal_Loop), + _loop_or_ctrl(igvn.C->comp_arena()), _igvn(igvn), _verify_me(verify_me), _verify_only(verify_me == nullptr), From 19acaef2ccd6ea7fd1eb66abeea466dc00c76228 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 14 Feb 2024 17:00:29 +0100 Subject: [PATCH 08/13] VStatus --- src/hotspot/share/opto/superword.cpp | 6 +- src/hotspot/share/opto/vectorization.cpp | 72 +++++++++++------------- src/hotspot/share/opto/vectorization.hpp | 30 ++++++++-- 3 files changed, 62 insertions(+), 46 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 04b9c910635a5..b0619af928ff6 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2929,7 +2929,7 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) { } // Return nullptr if success, else failure message -const char* VLoopBody::construct() { +VStatus VLoopBody::construct() { assert(_body.length() == 0, "body is empty"); // First pass over loop body: @@ -2953,7 +2953,7 @@ const char* VLoopBody::construct() { n->dump(); } #endif - return VLoopBody::FAILURE_NODE_NOT_ALLOWED; + return VStatus::make_failure(VLoopBody::FAILURE_NODE_NOT_ALLOWED); } #ifdef ASSERT @@ -3025,7 +3025,7 @@ const char* VLoopBody::construct() { #endif assert(rpo_idx == -1 && body_count == _body.length(), "all body members found"); - return nullptr; // success + return VStatus::make_success(); } // Initialize per node info diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 93bdecb02e257..5f268bced090f 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ 
b/src/hotspot/share/opto/vectorization.cpp @@ -40,40 +40,38 @@ bool VLoop::check_preconditions() { } #endif - const char* return_state = check_preconditions_helper(); - assert(return_state != nullptr, "must have return state"); - if (return_state == VLoop::SUCCESS) { - return true; // success - } - + VStatus status = check_preconditions_helper(); + if (!status.is_success()) { #ifndef PRODUCT - if (is_trace_preconditions()) { - tty->print_cr("VLoop::check_preconditions: failed: %s", return_state); - } + if (is_trace_preconditions()) { + tty->print_cr("VLoop::check_preconditions: failed: %s", status.failure_reason()); + } #endif - return false; // failure + return false; // failure + } + return true; // success } -const char* VLoop::check_preconditions_helper() { +VStatus VLoop::check_preconditions_helper() { // Only accept vector width that is power of 2 int vector_width = Matcher::vector_width_in_bytes(T_BYTE); if (vector_width < 2 || !is_power_of_2(vector_width)) { - return VLoop::FAILURE_VECTOR_WIDTH; + return VStatus::make_failure(VLoop::FAILURE_VECTOR_WIDTH); } // Only accept valid counted loops (int) if (!_lpt->_head->as_Loop()->is_valid_counted_loop(T_INT)) { - return VLoop::FAILURE_VALID_COUNTED_LOOP; + return VStatus::make_failure(VLoop::FAILURE_VALID_COUNTED_LOOP); } _cl = _lpt->_head->as_CountedLoop(); _iv = _cl->phi()->as_Phi(); if (_cl->is_vectorized_loop()) { - return VLoop::FAILURE_ALREADY_VECTORIZED; + return VStatus::make_failure(VLoop::FAILURE_ALREADY_VECTORIZED); } if (_cl->is_unroll_only()) { - return VLoop::FAILURE_UNROLL_ONLY; + return VStatus::make_failure(VLoop::FAILURE_UNROLL_ONLY); } // Check for control flow in the body @@ -89,12 +87,12 @@ const char* VLoop::check_preconditions_helper() { _lpt->dump_head(); } #endif - return VLoop::FAILURE_CONTROL_FLOW; + return VStatus::make_failure(VLoop::FAILURE_CONTROL_FLOW); } // Make sure the are no extra control users of the loop backedge if (_cl->back_control()->outcnt() != 1) { - return 
VLoop::FAILURE_BACKEDGE; + return VStatus::make_failure(VLoop::FAILURE_BACKEDGE); } // To align vector memory accesses in the main-loop, we will have to adjust @@ -102,16 +100,16 @@ const char* VLoop::check_preconditions_helper() { if (_cl->is_main_loop()) { CountedLoopEndNode* pre_end = _cl->find_pre_loop_end(); if (pre_end == nullptr) { - return VLoop::FAILURE_PRE_LOOP_LIMIT; + return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT); } Node* pre_opaq1 = pre_end->limit(); if (pre_opaq1->Opcode() != Op_Opaque1) { - return VLoop::FAILURE_PRE_LOOP_LIMIT; + return VStatus::make_failure(VLoop::FAILURE_PRE_LOOP_LIMIT); } _pre_loop_end = pre_end; } - return VLoop::SUCCESS; + return VStatus::make_success(); } // Return true iff all submodules are loaded successfully @@ -124,24 +122,22 @@ bool VLoopAnalyzer::setup_submodules() { } #endif - const char* state = setup_submodules_helper(); - if (state == VLoopAnalyzer::SUCCESS) { - return true; // success - } - + VStatus status = setup_submodules_helper(); + if (!status.is_success()) { #ifndef PRODUCT - if (vloop().is_trace_loop_analyzer()) { - tty->print_cr("\nVLoopAnalyze::setup_submodules: failed: %s", state); - } + if (vloop().is_trace_loop_analyzer()) { + tty->print_cr("\nVLoopAnalyze::setup_submodules: failed: %s", status.failure_reason()); + } #endif - return false; // failed + return false; // failed + } + return true; // success } -// Return SUCCESS string iff all submodules are setup successfully -const char* VLoopAnalyzer::setup_submodules_helper() { +VStatus VLoopAnalyzer::setup_submodules_helper() { // Skip any loop that has not been assigned max unroll by analysis. 
if (SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0) { - return VLoopAnalyzer::FAILURE_NO_MAX_UNROLL; + return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_MAX_UNROLL); } if (SuperWordReductions) { @@ -150,22 +146,22 @@ const char* VLoopAnalyzer::setup_submodules_helper() { _memory_slices.find_memory_slices(); - // If there is no memory slice detected, that means there is no store. + // If there is no memory slice detected, it means there is no store. // If there is no reduction and no store, then we give up, because // vectorization is not possible anyway (given current limitations). if (!reductions().is_marked_reduction_loop() && _memory_slices.heads().is_empty()) { - return VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE; + return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE); } - const char* body_failure = _body.construct(); - if (body_failure != nullptr) { - return body_failure; + VStatus body_status = _body.construct(); + if (!body_status.is_success()) { + return body_status; } _types.compute_vector_element_type(); - return VLoopAnalyzer::SUCCESS; + return VStatus::make_success(); } #ifndef PRODUCT diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 1d32ff694a660..7b2f3e340db40 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -33,6 +33,28 @@ // Code in this file and the vectorization.cpp contains shared logics and // utilities for C2's loop auto-vectorization. 
+class VStatus : public StackObj { +private: + const char* _failure_reason; + + VStatus(const char* failure_reason) : _failure_reason(failure_reason) {} + +public: + static VStatus make_success() { return VStatus(nullptr); } + + static VStatus make_failure(const char* failure_reason) { + assert(failure_reason != nullptr, "must have reason"); + return VStatus(failure_reason); + } + + bool is_success() const { return _failure_reason == nullptr; } + + const char* failure_reason() const { + assert(!is_success(), "only failures have reason"); + return _failure_reason; + } +}; + #ifndef PRODUCT // Access to TraceAutoVectorization tags class VTrace : public StackObj { @@ -62,7 +84,6 @@ class VLoop : public StackObj { NOT_PRODUCT(VTrace _vtrace;) - static constexpr char const* SUCCESS = "success"; static constexpr char const* FAILURE_ALREADY_VECTORIZED = "loop already vectorized"; static constexpr char const* FAILURE_UNROLL_ONLY = "loop only wants to be unrolled"; static constexpr char const* FAILURE_VECTOR_WIDTH = "vector_width must be power of 2"; @@ -145,7 +166,7 @@ class VLoop : public StackObj { bool check_preconditions(); private: - const char* check_preconditions_helper(); + VStatus check_preconditions_helper(); }; // Optimization to keep allocation of large arrays in AutoVectorization low. @@ -319,7 +340,7 @@ class VLoopBody : public StackObj { NONCOPYABLE(VLoopBody); - const char* construct(); + VStatus construct(); const GrowableArray& body() const { return _body; } NOT_PRODUCT( void print() const; ) @@ -425,7 +446,6 @@ class VLoopTypes : public StackObj { // as possible, though some submodules do require other submodules. 
class VLoopAnalyzer : StackObj { private: - static constexpr char const* SUCCESS = "success"; static constexpr char const* FAILURE_NO_MAX_UNROLL = "slp max unroll analysis required"; static constexpr char const* FAILURE_NO_REDUCTION_OR_STORE = "no reduction and no store in loop"; @@ -471,7 +491,7 @@ class VLoopAnalyzer : StackObj { private: bool setup_submodules(); - const char* setup_submodules_helper(); + VStatus setup_submodules_helper(); }; // A vectorization pointer (VPointer) has information about an address for From 6c28172c8f4c7440e1562aaa1974446e934245a3 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 14 Feb 2024 17:07:09 +0100 Subject: [PATCH 09/13] Apply suggestions from code review thanks Christian Co-authored-by: Christian Hagedorn --- src/hotspot/share/opto/superword.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index b0619af928ff6..becf53b8ac402 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -758,14 +758,14 @@ void SuperWord::dependence_graph() { // First, assign a dependence node to each memory node for (int i = 0; i < body().length(); i++ ) { - Node *n = body().at(i); + Node* n = body().at(i); if (n->is_Mem() || n->is_memory_phi()) { _dg.make_node(n); } } - const GrowableArray &mem_slice_head = vloop_analyzer().memory_slices().heads(); - const GrowableArray &mem_slice_tail = vloop_analyzer().memory_slices().tails(); + const GrowableArray& mem_slice_head = vloop_analyzer().memory_slices().heads(); + const GrowableArray& mem_slice_tail = vloop_analyzer().memory_slices().tails(); ResourceMark rm; GrowableArray slice_nodes; From ba07a799a626b3ef8e50de7d06999c9acd686041 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 14 Feb 2024 17:12:34 +0100 Subject: [PATCH 10/13] more for Christian --- src/hotspot/share/opto/vectorization.hpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 
deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 7b2f3e340db40..827a9a3c26df5 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -258,23 +258,29 @@ class VLoopReductions : public StackObj { // that it assumes counted loops and requires that reduction nodes are not // used within the loop except by their reduction cycle predecessors. void mark_reductions(); + // Whether n is a reduction operator and part of a reduction cycle. // This function can be used for individual queries outside auto-vectorization, // e.g. to inform matching in target-specific code. Otherwise, the // almost-equivalent but faster mark_reductions() is preferable. static bool is_reduction(const Node* n); + // Whether n is marked as a reduction node. bool is_marked_reduction(const Node* n) const { return _loop_reductions.test(n->_idx); } + bool is_marked_reduction_loop() const { return !_loop_reductions.is_empty(); } + // Are s1 and s2 reductions with a data path between them? bool is_marked_reduction_pair(Node* s1, Node* s2) const; private: // Whether n is a standard reduction operator. static bool is_reduction_operator(const Node* n); + // Whether n is part of a reduction cycle via the 'input' edge index. To bound // the search, constrain the size of reduction cycles to LoopMaxUnroll. static bool in_reduction_cycle(const Node* n, uint input); + // Reference to the i'th input node of n, commuting the inputs of binary nodes // whose edges have been swapped. Assumes n is a commutative operation. 
static Node* original_input(const Node* n, uint i); @@ -480,8 +486,6 @@ class VLoopAnalyzer : StackObj { bool success() const { return _success; } - Arena* arena() { return &_arena; } - // Read-only accessors for submodules const VLoop& vloop() const { return _vloop; } const VLoopReductions& reductions() const { return _reductions; } From 1d771fdda4f3d6bbce009939762db883c00986e4 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 15 Feb 2024 19:26:49 +0100 Subject: [PATCH 11/13] remove accessors, use fields instead --- src/hotspot/share/opto/superword.cpp | 113 ++++++++++++----------- src/hotspot/share/opto/superword.hpp | 69 +++++++------- src/hotspot/share/opto/vectorization.cpp | 12 +-- src/hotspot/share/opto/vectorization.hpp | 39 +++----- 4 files changed, 111 insertions(+), 122 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index becf53b8ac402..e4cce5f67fa9e 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -40,9 +40,10 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : _vloop_analyzer(vloop_analyzer), + _vloop(vloop_analyzer.vloop()), _arena(mtCompiler), _packset(arena(), 8, 0, nullptr), // packs for the current block - _node_info(arena(), vloop().estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node + _node_info(arena(), _vloop.estimated_body_length(), 0, SWNodeInfo::initial), // info needed per node _clone_map(phase()->C->clone_map()), // map of nodes created in cloning _align_to_ref(nullptr), // memory reference to align vectors to _dg(arena()), // dependence graph @@ -299,7 +300,7 @@ Node* VLoopReductions::original_input(const Node* n, uint i) { void VLoopReductions::mark_reductions() { assert(_loop_reductions.is_empty(), "must not yet be computed"); - CountedLoopNode* cl = vloop().cl(); + CountedLoopNode* cl = _vloop.cl(); // Iterate through all phi nodes associated to the loop and search for // reduction cycles in the 
basic block. @@ -311,7 +312,7 @@ void VLoopReductions::mark_reductions() { if (phi->outcnt() == 0) { continue; } - if (phi == vloop().iv()) { + if (phi == _vloop.iv()) { continue; } // The phi's loop-back is considered the first node in the reduction cycle. @@ -335,9 +336,9 @@ void VLoopReductions::mark_reductions() { // to the phi node following edge index 'input'. PathEnd path = find_in_path( - first, input, vloop().lpt()->_body.size(), + first, input, _vloop.lpt()->_body.size(), [&](const Node* n) { return n->Opcode() == first->Opcode() && - vloop().in_bb(n); }, + _vloop.in_bb(n); }, [&](const Node* n) { return n == phi; }); if (path.first != nullptr) { reduction_input = input; @@ -356,7 +357,7 @@ void VLoopReductions::mark_reductions() { for (int i = 0; i < path_nodes; i++) { for (DUIterator_Fast jmax, j = current->fast_outs(jmax); j < jmax; j++) { Node* u = current->fast_out(j); - if (!vloop().in_bb(u)) { + if (!_vloop.in_bb(u)) { continue; } if (u == succ) { @@ -533,13 +534,13 @@ void SuperWord::find_adjacent_refs() { set_align_to_ref(align_to_mem_ref); } - VPointer align_to_ref_p(mem_ref, vloop()); + VPointer align_to_ref_p(mem_ref, _vloop); // Set alignment relative to "align_to_ref" for all related memory operations. 
for (int i = memops.size() - 1; i >= 0; i--) { MemNode* s = memops.at(i)->as_Mem(); if (isomorphic(s, mem_ref) && (!_do_vector_loop || same_origin_idx(s, mem_ref))) { - VPointer p2(s, vloop()); + VPointer p2(s, _vloop); if (p2.comparable(align_to_ref_p)) { int align = memory_alignment(s, iv_adjustment); set_alignment(s, align); @@ -598,11 +599,11 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) { // Count number of comparable memory ops for (uint i = 0; i < memops.size(); i++) { MemNode* s1 = memops.at(i)->as_Mem(); - VPointer p1(s1, vloop()); + VPointer p1(s1, _vloop); for (uint j = i+1; j < memops.size(); j++) { MemNode* s2 = memops.at(j)->as_Mem(); if (isomorphic(s1, s2)) { - VPointer p2(s2, vloop()); + VPointer p2(s2, _vloop); if (p1.comparable(p2)) { (*cmp_ct.adr_at(i))++; (*cmp_ct.adr_at(j))++; @@ -623,7 +624,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) { if (s->is_Store()) { int vw = vector_width_in_bytes(s); assert(vw > 1, "sanity"); - VPointer p(s, vloop()); + VPointer p(s, _vloop); if ( cmp_ct.at(j) > max_ct || (cmp_ct.at(j) == max_ct && ( vw > max_vw || @@ -646,7 +647,7 @@ MemNode* SuperWord::find_align_to_ref(Node_List &memops, int &idx) { if (s->is_Load()) { int vw = vector_width_in_bytes(s); assert(vw > 1, "sanity"); - VPointer p(s, vloop()); + VPointer p(s, _vloop); if ( cmp_ct.at(j) > max_ct || (cmp_ct.at(j) == max_ct && ( vw > max_vw || @@ -719,7 +720,7 @@ int SuperWord::get_vw_bytes_special(MemNode* s) { //---------------------------get_iv_adjustment--------------------------- // Calculate loop's iv adjustment for this memory ops. 
int SuperWord::get_iv_adjustment(MemNode* mem_ref) { - VPointer align_to_ref_p(mem_ref, vloop()); + VPointer align_to_ref_p(mem_ref, _vloop); int offset = align_to_ref_p.offset_in_bytes(); int scale = align_to_ref_p.scale_in_bytes(); int elt_size = align_to_ref_p.memory_size(); @@ -764,8 +765,8 @@ void SuperWord::dependence_graph() { } } - const GrowableArray& mem_slice_head = vloop_analyzer().memory_slices().heads(); - const GrowableArray& mem_slice_tail = vloop_analyzer().memory_slices().tails(); + const GrowableArray& mem_slice_head = _vloop_analyzer.memory_slices().heads(); + const GrowableArray& mem_slice_tail = _vloop_analyzer.memory_slices().tails(); ResourceMark rm; GrowableArray slice_nodes; @@ -776,7 +777,7 @@ void SuperWord::dependence_graph() { MemNode* tail = mem_slice_tail.at(i); // Get slice in predecessor order (last is first) - vloop_analyzer().memory_slices().get_slice(head, tail, slice_nodes); + _vloop_analyzer.memory_slices().get_slice(head, tail, slice_nodes); // Make the slice dependent on the root DepMem* slice = _dg.dep(head); @@ -794,13 +795,13 @@ void SuperWord::dependence_graph() { if (_dg.dep(s1)->in_cnt() == 0) { _dg.make_edge(slice, s1); } - VPointer p1(s1->as_Mem(), vloop()); + VPointer p1(s1->as_Mem(), _vloop); bool sink_dependent = true; for (int k = j - 1; k >= 0; k--) { Node* s2 = slice_nodes.at(k); if (s1->is_Load() && s2->is_Load()) continue; - VPointer p2(s2->as_Mem(), vloop()); + VPointer p2(s2->as_Mem(), _vloop); int cmp = p1.cmp(p2); if (!VPointer::not_equal(cmp)) { @@ -831,12 +832,12 @@ void SuperWord::dependence_graph() { void VLoopMemorySlices::find_memory_slices() { assert(_heads.is_empty(), "not yet computed"); assert(_tails.is_empty(), "not yet computed"); - CountedLoopNode* cl = vloop().cl(); + CountedLoopNode* cl = _vloop.cl(); // Iterate over all memory phis for (DUIterator_Fast imax, i = cl->fast_outs(imax); i < imax; i++) { PhiNode* phi = cl->fast_out(i)->isa_Phi(); - if (phi != nullptr && vloop().in_bb(phi) && 
phi->is_memory_phi()) { + if (phi != nullptr && _vloop.in_bb(phi) && phi->is_memory_phi()) { Node* phi_tail = phi->in(LoopNode::LoopBackControl); if (phi_tail != phi->in(LoopNode::EntryControl)) { _heads.push(phi); @@ -845,7 +846,7 @@ void VLoopMemorySlices::find_memory_slices() { } } - NOT_PRODUCT( if (vloop().is_trace_memory_slices()) { print(); } ) + NOT_PRODUCT( if (_vloop.is_trace_memory_slices()) { print(); } ) } #ifndef PRODUCT @@ -865,19 +866,19 @@ void VLoopMemorySlices::get_slice(PhiNode* head, MemNode* tail, GrowableArrayfast_outs(imax); i < imax; i++) { Node* out = n->fast_out(i); if (out->is_Load()) { - if (vloop().in_bb(out)) { + if (_vloop.in_bb(out)) { slice.push(out); } } else { // FIXME - if (out->is_MergeMem() && !vloop().in_bb(out)) { + if (out->is_MergeMem() && !_vloop.in_bb(out)) { // Either unrolling is causing a memory edge not to disappear, // or need to run igvn.optimize() again before SLP - } else if (out->is_memory_phi() && !vloop().in_bb(out)) { + } else if (out->is_memory_phi() && !_vloop.in_bb(out)) { // Ditto. Not sure what else to check further. } else if (out->Opcode() == Op_StoreCM && out->in(MemNode::OopStore) == n) { // StoreCM has an input edge used as a precedence edge. @@ -895,7 +896,7 @@ void VLoopMemorySlices::get_slice(PhiNode* head, MemNode* tail, GrowableArrayprint_cr("\nVLoopMemorySlices::get_slice:"); head->dump(); for (int j = slice.length() - 1; j >= 0 ; j--) { @@ -970,8 +971,8 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) { // Adjacent memory references must have the same base, be comparable // and have the correct distance between them. 
- VPointer p1(s1->as_Mem(), vloop()); - VPointer p2(s2->as_Mem(), vloop()); + VPointer p1(s1->as_Mem(), _vloop); + VPointer p2(s2->as_Mem(), _vloop); if (p1.base() != p2.base() || !p1.comparable(p2)) return false; int diff = p2.offset_in_bytes() - p1.offset_in_bytes(); return diff == data_size(s1); @@ -1602,8 +1603,8 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs"); const MemNode* mem_ref = pack->at(0)->as_Mem(); - VPointer mem_ref_p(mem_ref, vloop()); - const CountedLoopEndNode* pre_end = vloop().pre_loop_end(); + VPointer mem_ref_p(mem_ref, _vloop); + const CountedLoopEndNode* pre_end = _vloop.pre_loop_end(); assert(pre_end->stride_is_con(), "pre loop stride is constant"); AlignmentSolver solver(pack->at(0)->as_Mem(), @@ -2273,7 +2274,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) { // loop we may have a different last store, and we need to adjust the uses accordingly. GrowableArray old_last_store_in_slice(max_slices, max_slices, nullptr); - const GrowableArray &mem_slice_head = vloop_analyzer().memory_slices().heads(); + const GrowableArray &mem_slice_head = _vloop_analyzer.memory_slices().heads(); // (1) Set up the initial memory state from Phi. And find the old last store. for (int i = 0; i < mem_slice_head.length(); i++) { @@ -2394,7 +2395,7 @@ bool SuperWord::output() { // Walk up the memory chain, and ignore any StoreVector that provably // does not have any memory dependency. while (mem->is_StoreVector()) { - VPointer p_store(mem->as_Mem(), vloop()); + VPointer p_store(mem->as_Mem(), _vloop); if (p_store.overlap_possible_with_any_in(p)) { break; } else { @@ -2937,10 +2938,10 @@ VStatus VLoopBody::construct() { // (2) Count number of nodes, and create a temporary map (_idx -> bb_idx). // (3) Verify that all non-ctrl nodes have an input inside the loop. 
int body_count = 0; - for (uint i = 0; i < vloop().lpt()->_body.size(); i++) { - Node* n = vloop().lpt()->_body.at(i); + for (uint i = 0; i < _vloop.lpt()->_body.size(); i++) { + Node* n = _vloop.lpt()->_body.at(i); set_bb_idx(n, i); // Create a temporary map - if (vloop().in_bb(n)) { + if (_vloop.in_bb(n)) { body_count++; if (n->is_LoadStore() || n->is_MergeMem() || @@ -2948,7 +2949,7 @@ VStatus VLoopBody::construct() { // Bailout if the loop has LoadStore, MergeMem or data Proj // nodes. Superword optimization does not work with them. #ifndef PRODUCT - if (vloop().is_trace_body()) { + if (_vloop.is_trace_body()) { tty->print_cr("VLoopBody::construct: fails because of unhandled node:"); n->dump(); } @@ -2961,7 +2962,7 @@ VStatus VLoopBody::construct() { bool found = false; for (uint j = 0; j < n->req(); j++) { Node* def = n->in(j); - if (def != nullptr && vloop().in_bb(def)) { + if (def != nullptr && _vloop.in_bb(def)) { found = true; break; } @@ -2978,8 +2979,8 @@ VStatus VLoopBody::construct() { VectorSet visited; VectorSet post_visited; - visited.set(bb_idx(vloop().cl())); - stack.push(vloop().cl()); + visited.set(bb_idx(_vloop.cl())); + stack.push(_vloop.cl()); // Do a depth first walk over out edges int rpo_idx = body_count - 1; @@ -2992,9 +2993,9 @@ VStatus VLoopBody::construct() { const int old_length = stack.length(); for (DUIterator_Fast imax, i = n->fast_outs(imax); i < imax; i++) { Node* use = n->fast_out(i); - if (vloop().in_bb(use) && !visited.test(bb_idx(use)) && + if (_vloop.in_bb(use) && !visited.test(bb_idx(use)) && // Don't go around backedge - (!use->is_Phi() || n == vloop().cl())) { + (!use->is_Phi() || n == _vloop.cl())) { stack.push(use); } } @@ -3019,7 +3020,7 @@ VStatus VLoopBody::construct() { } #ifndef PRODUCT - if (vloop().is_trace_body()) { + if (_vloop.is_trace_body()) { print(); } #endif @@ -3120,7 +3121,7 @@ int SuperWord::max_vector_size_in_def_use_chain(Node* n) { void VLoopTypes::compute_vector_element_type() { #ifndef PRODUCT - 
if (vloop().is_trace_vector_element_type()) { + if (_vloop.is_trace_vector_element_type()) { tty->print_cr("\nVLoopTypes::compute_vector_element_type:"); } #endif @@ -3151,13 +3152,13 @@ void VLoopTypes::compute_vector_element_type() { Node* in = n->in(j); // Don't propagate through a memory if (!in->is_Mem() && - vloop().in_bb(in) && + _vloop.in_bb(in) && velt_type(in)->basic_type() == T_INT && data_size(n) < data_size(in)) { bool same_type = true; for (DUIterator_Fast kmax, k = in->fast_outs(kmax); k < kmax; k++) { Node *use = in->fast_out(k); - if (!vloop().in_bb(use) || !same_velt_type(use, n)) { + if (!_vloop.in_bb(use) || !same_velt_type(use, n)) { same_type = false; break; } @@ -3175,7 +3176,7 @@ void VLoopTypes::compute_vector_element_type() { if (VectorNode::is_shift_opcode(op) || op == Op_AbsI || op == Op_ReverseBytesI) { Node* load = in->in(1); if (load->is_Load() && - vloop().in_bb(load) && + _vloop.in_bb(load) && (velt_type(load)->basic_type() == T_INT)) { // Only Load nodes distinguish signed (LoadS/LoadB) and unsigned // (LoadUS/LoadUB) values. Store nodes only have one version. 
@@ -3200,9 +3201,9 @@ void VLoopTypes::compute_vector_element_type() { assert(nn->is_Cmp(), "always have Cmp above Bool"); } if (nn->is_Cmp() && nn->in(0) == nullptr) { - assert(vloop().in_bb(nn->in(1)) || vloop().in_bb(nn->in(2)), + assert(_vloop.in_bb(nn->in(1)) || _vloop.in_bb(nn->in(2)), "one of the inputs must be in the loop too"); - if (vloop().in_bb(nn->in(1))) { + if (_vloop.in_bb(nn->in(1))) { set_velt_type(n, velt_type(nn->in(1))); } else { set_velt_type(n, velt_type(nn->in(2))); @@ -3210,7 +3211,7 @@ void VLoopTypes::compute_vector_element_type() { } } #ifndef PRODUCT - if (vloop().is_trace_vector_element_type()) { + if (_vloop.is_trace_vector_element_type()) { for (int i = 0; i < body.length(); i++) { Node* n = body.at(i); velt_type(n)->dump(); @@ -3229,7 +3230,7 @@ int SuperWord::memory_alignment(MemNode* s, int iv_adjust) { tty->print("SuperWord::memory_alignment within a vector memory reference for %d: ", s->_idx); s->dump(); } #endif - VPointer p(s, vloop()); + VPointer p(s, _vloop); if (!p.valid()) { NOT_PRODUCT(if(is_trace_superword_alignment()) tty->print_cr("SuperWord::memory_alignment: VPointer p invalid, return bottom_align");) return bottom_align; @@ -3269,7 +3270,7 @@ const Type* VLoopTypes::container_type(Node* n) const { } return Type::get_const_basic_type(bt); } - const Type* t = vloop().phase()->igvn().type(n); + const Type* t = _vloop.phase()->igvn().type(n); if (t->basic_type() == T_INT) { // A narrow type of arithmetic operations will be determined by // propagating the type of memory operations. 
@@ -3279,8 +3280,8 @@ const Type* VLoopTypes::container_type(Node* n) const { } bool VLoopMemorySlices::same_memory_slice(MemNode* m1, MemNode* m2) const { - return vloop().phase()->C->get_alias_index(m1->adr_type()) == - vloop().phase()->C->get_alias_index(m2->adr_type()); + return _vloop.phase()->C->get_alias_index(m1->adr_type()) == + _vloop.phase()->C->get_alias_index(m2->adr_type()); } //------------------------------in_packset--------------------------- @@ -3363,19 +3364,19 @@ void SuperWord::adjust_pre_loop_limit_to_align_main_loop_vectors() { assert(cl()->is_main_loop(), "can only do alignment for main loop"); // The opaque node for the limit, where we adjust the input - Opaque1Node* pre_opaq = vloop().pre_loop_end()->limit()->as_Opaque1(); + Opaque1Node* pre_opaq = _vloop.pre_loop_end()->limit()->as_Opaque1(); // Current pre-loop limit. Node* old_limit = pre_opaq->in(1); // Where we put new limit calculations. - Node* pre_ctrl = vloop().pre_loop_head()->in(LoopNode::EntryControl); + Node* pre_ctrl = _vloop.pre_loop_head()->in(LoopNode::EntryControl); // Ensure the original loop limit is available from the pre-loop Opaque1 node. Node* orig_limit = pre_opaq->original_loop_limit(); assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, ""); - VPointer align_to_ref_p(align_to_ref, vloop()); + VPointer align_to_ref_p(align_to_ref, _vloop); assert(align_to_ref_p.valid(), "sanity"); // For the main-loop, we want the address of align_to_ref to be memory aligned diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index af9fae308438d..ca3c4284f9ee6 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -198,6 +198,7 @@ class SWNodeInfo { class SuperWord : public ResourceObj { private: const VLoopAnalyzer& _vloop_analyzer; + const VLoop& _vloop; // Arena for small data structures. Large data structures are allocated in // VSharedData, and reused over many AutoVectorizations. 
@@ -222,118 +223,114 @@ class SuperWord : public ResourceObj { // Decide if loop can eventually be vectorized, and what unrolling factor is required. static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor); - // VLoopAnalyzer Accessors - const VLoopAnalyzer& vloop_analyzer() const { return _vloop_analyzer; } - // VLoop Accessors - const VLoop& vloop() const { return vloop_analyzer().vloop(); } - PhaseIdealLoop* phase() const { return vloop().phase(); } - PhaseIterGVN& igvn() const { return vloop().phase()->igvn(); } - IdealLoopTree* lpt() const { return vloop().lpt(); } - CountedLoopNode* cl() const { return vloop().cl(); } - PhiNode* iv() const { return vloop().iv(); } + PhaseIdealLoop* phase() const { return _vloop.phase(); } + PhaseIterGVN& igvn() const { return _vloop.phase()->igvn(); } + IdealLoopTree* lpt() const { return _vloop.lpt(); } + CountedLoopNode* cl() const { return _vloop.cl(); } + PhiNode* iv() const { return _vloop.iv(); } int iv_stride() const { return cl()->stride_con(); } - bool in_bb(const Node* n) const { return vloop().in_bb(n); } + bool in_bb(const Node* n) const { return _vloop.in_bb(n); } // VLoopReductions Accessors bool is_marked_reduction(const Node* n) const { - return vloop_analyzer().reductions().is_marked_reduction(n); + return _vloop_analyzer.reductions().is_marked_reduction(n); } bool reduction(Node* n1, Node* n2) const { - return vloop_analyzer().reductions().is_marked_reduction_pair(n1, n2); + return _vloop_analyzer.reductions().is_marked_reduction_pair(n1, n2); } // VLoopMemorySlices Accessors bool same_memory_slice(MemNode* n1, MemNode* n2) const { - return vloop_analyzer().memory_slices().same_memory_slice(n1, n2); + return _vloop_analyzer.memory_slices().same_memory_slice(n1, n2); } // VLoopBody Accessors const GrowableArray& body() const { - return vloop_analyzer().body().body(); + return _vloop_analyzer.body().body(); } int bb_idx(const Node* n) const { - return 
vloop_analyzer().body().bb_idx(n); + return _vloop_analyzer.body().bb_idx(n); } // VLoopTypes Accessors const Type* velt_type(Node* n) const { - return vloop_analyzer().types().velt_type(n); + return _vloop_analyzer.types().velt_type(n); } BasicType velt_basic_type(Node* n) const { - return vloop_analyzer().types().velt_basic_type(n); + return _vloop_analyzer.types().velt_basic_type(n); } bool same_velt_type(Node* n1, Node* n2) const { - return vloop_analyzer().types().same_velt_type(n1, n2); + return _vloop_analyzer.types().same_velt_type(n1, n2); } int data_size(Node* n) const { - return vloop_analyzer().types().data_size(n); + return _vloop_analyzer.types().data_size(n); } int vector_width(Node* n) const { - return vloop_analyzer().types().vector_width(n); + return _vloop_analyzer.types().vector_width(n); } int vector_width_in_bytes(const Node* n) const { - return vloop_analyzer().types().vector_width_in_bytes(n); + return _vloop_analyzer.types().vector_width_in_bytes(n); } #ifndef PRODUCT // TraceAutoVectorization and TraceSuperWord bool is_trace_superword_alignment() const { // Too verbose for TraceSuperWord - return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT); + return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT); } bool is_trace_superword_dependence_graph() const { return TraceSuperWord || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH); + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH); } bool is_trace_superword_adjacent_memops() const { return TraceSuperWord || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS); + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS); } bool is_trace_superword_rejections() const { return TraceSuperWord || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS); + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS); } bool is_trace_superword_packset() const { 
return TraceSuperWord || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET); + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET); } bool is_trace_superword_info() const { return TraceSuperWord || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO); + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO); } bool is_trace_superword_verbose() const { // Too verbose for TraceSuperWord - return vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE); + return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE); } bool is_trace_superword_any() const { return TraceSuperWord || is_trace_align_vector() || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO) || - vloop().vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE); + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ALIGNMENT) || + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_DEPENDENCE_GRAPH) || + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_ADJACENT_MEMOPS) || + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_REJECTIONS) || + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_PACKSET) || + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_INFO) || + _vloop.vtrace().is_trace(TraceAutoVectorizationTag::SW_VERBOSE); } bool is_trace_align_vector() const { - return vloop().vtrace().is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) || + return _vloop.vtrace().is_trace(TraceAutoVectorizationTag::ALIGN_VECTOR) || is_trace_superword_verbose(); } #endif diff --git a/src/hotspot/share/opto/vectorization.cpp 
b/src/hotspot/share/opto/vectorization.cpp index 5f268bced090f..d6554670d16f6 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -115,17 +115,17 @@ VStatus VLoop::check_preconditions_helper() { // Return true iff all submodules are loaded successfully bool VLoopAnalyzer::setup_submodules() { #ifndef PRODUCT - if (vloop().is_trace_loop_analyzer()) { + if (_vloop.is_trace_loop_analyzer()) { tty->print_cr("\nVLoopAnalyzer::setup_submodules"); - vloop().lpt()->dump_head(); - vloop().cl()->dump(); + _vloop.lpt()->dump_head(); + _vloop.cl()->dump(); } #endif VStatus status = setup_submodules_helper(); if (!status.is_success()) { #ifndef PRODUCT - if (vloop().is_trace_loop_analyzer()) { + if (_vloop.is_trace_loop_analyzer()) { tty->print_cr("\nVLoopAnalyze::setup_submodules: failed: %s", status.failure_reason()); } #endif @@ -136,7 +136,7 @@ bool VLoopAnalyzer::setup_submodules() { VStatus VLoopAnalyzer::setup_submodules_helper() { // Skip any loop that has not been assigned max unroll by analysis. - if (SuperWordLoopUnrollAnalysis && vloop().cl()->slp_max_unroll() == 0) { + if (SuperWordLoopUnrollAnalysis && _vloop.cl()->slp_max_unroll() == 0) { return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_MAX_UNROLL); } @@ -284,7 +284,7 @@ bool VPointer::invariant(Node* n) const { // main loop (Illegal invariant happens when n_c is a CastII node that // prevents data nodes to flow above the main loop). 
Node* n_c = phase()->get_ctrl(n); - return phase()->is_dominator(n_c, vloop().pre_loop_head()); + return phase()->is_dominator(n_c, _vloop.pre_loop_head()); } } return is_not_member; diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 827a9a3c26df5..a4cd3579e3828 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -130,27 +130,27 @@ class VLoop : public StackObj { const VTrace& vtrace() const { return _vtrace; } bool is_trace_preconditions() const { - return vtrace().is_trace(TraceAutoVectorizationTag::PRECONDITIONS); + return _vtrace.is_trace(TraceAutoVectorizationTag::PRECONDITIONS); } bool is_trace_loop_analyzer() const { - return vtrace().is_trace(TraceAutoVectorizationTag::LOOP_ANALYZER); + return _vtrace.is_trace(TraceAutoVectorizationTag::LOOP_ANALYZER); } bool is_trace_memory_slices() const { - return vtrace().is_trace(TraceAutoVectorizationTag::MEMORY_SLICES); + return _vtrace.is_trace(TraceAutoVectorizationTag::MEMORY_SLICES); } bool is_trace_body() const { - return vtrace().is_trace(TraceAutoVectorizationTag::BODY); + return _vtrace.is_trace(TraceAutoVectorizationTag::BODY); } bool is_trace_vector_element_type() const { - return vtrace().is_trace(TraceAutoVectorizationTag::TYPES); + return _vtrace.is_trace(TraceAutoVectorizationTag::TYPES); } bool is_trace_pointer_analysis() const { - return vtrace().is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS); + return _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS); } #endif @@ -221,7 +221,6 @@ class VLoopReductions : public StackObj { NONCOPYABLE(VLoopReductions); private: - const VLoop& vloop() const { return _vloop; } // Search for a path P = (n_1, n_2, ..., n_k) such that: // - original_input(n_i, input) = n_i+1 for all 1 <= i < k, // - path(n) for all n in P, @@ -295,8 +294,6 @@ class VLoopMemorySlices : public StackObj { GrowableArray _heads; GrowableArray _tails; - const VLoop& vloop() 
const { return _vloop; } - public: VLoopMemorySlices(Arena* arena, const VLoop& vloop) : _vloop(vloop), @@ -336,8 +333,6 @@ class VLoopBody : public StackObj { // Can be very large, and thus lives in VSharedData GrowableArray& _body_idx; - const VLoop& vloop() const { return _vloop; } - public: VLoopBody(Arena* arena, const VLoop& vloop, VSharedData& vshared) : _vloop(vloop), @@ -384,9 +379,6 @@ class VLoopTypes : public StackObj { // bb_idx -> vector element type GrowableArray _velt_type; - const VLoop& vloop() const { return _vloop; } - const VLoopBody& body() const { return _body; } - public: VLoopTypes(Arena* arena, const VLoop& vloop, @@ -400,8 +392,8 @@ class VLoopTypes : public StackObj { NOT_PRODUCT( void print() const; ) const Type* velt_type(const Node* n) const { - assert(vloop().in_bb(n), "only call on nodes in loop"); - const Type* t = _velt_type.at(body().bb_idx(n)); + assert(_vloop.in_bb(n), "only call on nodes in loop"); + const Type* t = _velt_type.at(_body.bb_idx(n)); assert(t != nullptr, "must have type"); return t; } @@ -439,8 +431,8 @@ class VLoopTypes : public StackObj { private: void set_velt_type(Node* n, const Type* t) { assert(t != nullptr, "cannot set nullptr"); - assert(vloop().in_bb(n), "only call on nodes in loop"); - _velt_type.at_put(body().bb_idx(n), t); + assert(_vloop.in_bb(n), "only call on nodes in loop"); + _velt_type.at_put(_body.bb_idx(n), t); } // Smallest type containing range of values @@ -478,7 +470,7 @@ class VLoopAnalyzer : StackObj { _reductions (&_arena, vloop), _memory_slices (&_arena, vloop), _body (&_arena, vloop, vshared), - _types (&_arena, vloop, body()) + _types (&_arena, vloop, _body) { _success = setup_submodules(); } @@ -522,10 +514,9 @@ class VPointer : public ArenaObj { bool _analyze_only; // Used in loop unrolling only for vpointer trace uint _stack_idx; // Used in loop unrolling only for vpointer trace - const VLoop& vloop() const { return _vloop; } - PhaseIdealLoop* phase() const { return 
vloop().phase(); } - IdealLoopTree* lpt() const { return vloop().lpt(); } - PhiNode* iv() const { return vloop().iv(); } + PhaseIdealLoop* phase() const { return _vloop.phase(); } + IdealLoopTree* lpt() const { return _vloop.lpt(); } + PhiNode* iv() const { return _vloop.iv(); } bool is_loop_member(Node* n) const; bool invariant(Node* n) const; @@ -598,7 +589,7 @@ class VPointer : public ArenaObj { bool overlap_possible_with_any_in(Node_List* p) { for (uint k = 0; k < p->size(); k++) { MemNode* mem = p->at(k)->as_Mem(); - VPointer p_mem(mem, vloop()); + VPointer p_mem(mem, _vloop); // Only if we know that we have Less or Greater can we // be sure that there can never be an overlap between // the two memory regions. From 77695f059a3ce581ee84c69fafd3bccfc656d2fb Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 15 Feb 2024 23:53:32 +0100 Subject: [PATCH 12/13] indentation --- src/hotspot/share/opto/superword.hpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index ca3c4284f9ee6..00a8c915ac7fb 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -224,13 +224,13 @@ class SuperWord : public ResourceObj { static void unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor); // VLoop Accessors - PhaseIdealLoop* phase() const { return _vloop.phase(); } - PhaseIterGVN& igvn() const { return _vloop.phase()->igvn(); } - IdealLoopTree* lpt() const { return _vloop.lpt(); } - CountedLoopNode* cl() const { return _vloop.cl(); } - PhiNode* iv() const { return _vloop.iv(); } - int iv_stride() const { return cl()->stride_con(); } - bool in_bb(const Node* n) const { return _vloop.in_bb(n); } + PhaseIdealLoop* phase() const { return _vloop.phase(); } + PhaseIterGVN& igvn() const { return _vloop.phase()->igvn(); } + IdealLoopTree* lpt() const { return _vloop.lpt(); } + CountedLoopNode* cl() const { return 
_vloop.cl(); } + PhiNode* iv() const { return _vloop.iv(); } + int iv_stride() const { return cl()->stride_con(); } + bool in_bb(const Node* n) const { return _vloop.in_bb(n); } // VLoopReductions Accessors bool is_marked_reduction(const Node* n) const { From 9c70b2d63135e3f26db0d64b76e75de1c570960c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 26 Feb 2024 10:09:23 +0100 Subject: [PATCH 13/13] review updates for Christian --- src/hotspot/share/opto/superword.cpp | 14 +++++++------- src/hotspot/share/opto/vectorization.cpp | 2 +- src/hotspot/share/opto/vectorization.hpp | 10 +++++----- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index e4cce5f67fa9e..75b5e53f2792d 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -777,7 +777,7 @@ void SuperWord::dependence_graph() { MemNode* tail = mem_slice_tail.at(i); // Get slice in predecessor order (last is first) - _vloop_analyzer.memory_slices().get_slice(head, tail, slice_nodes); + _vloop_analyzer.memory_slices().get_slice_in_reverse_order(head, tail, slice_nodes); // Make the slice dependent on the root DepMem* slice = _dg.dep(head); @@ -861,8 +861,8 @@ void VLoopMemorySlices::print() const { #endif // Get all memory nodes of a slice, in reverse order -void VLoopMemorySlices::get_slice(PhiNode* head, MemNode* tail, GrowableArray &slice) const { - assert(slice.length() == 0, "start empty"); +void VLoopMemorySlices::get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray &slice) const { + assert(slice.is_empty(), "start empty"); Node* n = tail; Node* prev = nullptr; while (true) { @@ -897,7 +897,7 @@ void VLoopMemorySlices::get_slice(PhiNode* head, MemNode* tail, GrowableArrayprint_cr("\nVLoopMemorySlices::get_slice:"); + tty->print_cr("\nVLoopMemorySlices::get_slice_in_reverse_order:"); head->dump(); for (int j = slice.length() - 1; j >= 0 ; j--) { 
slice.at(j)->dump(); @@ -2274,7 +2274,7 @@ void SuperWord::schedule_reorder_memops(Node_List &memops_schedule) { // loop we may have a different last store, and we need to adjust the uses accordingly. GrowableArray old_last_store_in_slice(max_slices, max_slices, nullptr); - const GrowableArray &mem_slice_head = _vloop_analyzer.memory_slices().heads(); + const GrowableArray& mem_slice_head = _vloop_analyzer.memory_slices().heads(); // (1) Set up the initial memory state from Phi. And find the old last store. for (int i = 0; i < mem_slice_head.length(); i++) { @@ -2931,7 +2931,7 @@ bool SuperWord::is_vector_use(Node* use, int u_idx) { // Return nullptr if success, else failure message VStatus VLoopBody::construct() { - assert(_body.length() == 0, "body is empty"); + assert(_body.is_empty(), "body is empty"); // First pass over loop body: // (1) Check that there are no unwanted nodes (LoadStore, MergeMem, data Proj). @@ -3202,7 +3202,7 @@ void VLoopTypes::compute_vector_element_type() { } if (nn->is_Cmp() && nn->in(0) == nullptr) { assert(_vloop.in_bb(nn->in(1)) || _vloop.in_bb(nn->in(2)), - "one of the inputs must be in the loop too"); + "one of the inputs must be in the loop, too"); if (_vloop.in_bb(nn->in(1))) { set_velt_type(n, velt_type(nn->in(1))); } else { diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index d6554670d16f6..d8d4e03210635 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -149,7 +149,7 @@ VStatus VLoopAnalyzer::setup_submodules_helper() { // If there is no memory slice detected, it means there is no store. // If there is no reduction and no store, then we give up, because // vectorization is not possible anyway (given current limitations). 
- if (!reductions().is_marked_reduction_loop() && + if (!_reductions.is_marked_reduction_loop() && _memory_slices.heads().is_empty()) { return VStatus::make_failure(VLoopAnalyzer::FAILURE_NO_REDUCTION_OR_STORE); } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index a4cd3579e3828..3f897010d9db1 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -303,11 +303,11 @@ class VLoopMemorySlices : public StackObj { void find_memory_slices(); - const GrowableArray &heads() const { return _heads; } - const GrowableArray &tails() const { return _tails; } + const GrowableArray& heads() const { return _heads; } + const GrowableArray& tails() const { return _tails; } // Get all memory nodes of a slice, in reverse order - void get_slice(PhiNode* head, MemNode* tail, GrowableArray &slice) const; + void get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray& slice) const; bool same_memory_slice(MemNode* m1, MemNode* m2) const; @@ -331,7 +331,7 @@ class VLoopBody : public StackObj { // Mapping node->_idx -> body_idx // Can be very large, and thus lives in VSharedData - GrowableArray& _body_idx; + GrowableArray& _body_idx; public: VLoopBody(Arena* arena, const VLoop& vloop, VSharedData& vshared) : @@ -463,7 +463,7 @@ class VLoopAnalyzer : StackObj { VLoopTypes _types; public: - VLoopAnalyzer(const VLoop& vloop, VSharedData &vshared) : + VLoopAnalyzer(const VLoop& vloop, VSharedData& vshared) : _vloop(vloop), _arena(mtCompiler), _success(false),