From 4a46cd08e56e115f620d6bfe0df02724d2e3731d Mon Sep 17 00:00:00 2001 From: Pengfei Li Date: Tue, 11 Jul 2023 09:53:09 +0000 Subject: [PATCH] 8311691: C2: Remove legacy code related to PostLoopMultiversioning As discussed in JDK-8308994, we are working on re-implementation of post loop vectorization and planning to refactor current SuperWord code. As nobody is using or maintaining the old implementation now, to make the refactoring work easier, we propose to remove the legacy code of the old implementation first. This patch removes all code related to `PostLoopMultiversioning` inside and outside SuperWord. After the removal, `SLP_extract()` in SuperWord should only work on main loops. So we also removed all `is_main_loop()` checks inside and added assertions instead. Tested with hotspot::hotspot_all_no_apps, jdk tier1~3, langtools tier1 and 100k fuzzer tests on x86 and AArch64, no issues were found. --- .../share/compiler/compilerDefinitions.cpp | 9 - src/hotspot/share/opto/c2_globals.hpp | 3 - src/hotspot/share/opto/loopTransform.cpp | 194 ---------- src/hotspot/share/opto/loopnode.cpp | 25 +- src/hotspot/share/opto/loopnode.hpp | 32 +- src/hotspot/share/opto/superword.cpp | 333 ++---------------- src/hotspot/share/opto/superword.hpp | 5 - .../TestRangeCheckEliminationDisabled.java | 41 --- .../runner/VectorizationTestRunner.java | 3 - 9 files changed, 44 insertions(+), 601 deletions(-) delete mode 100644 test/hotspot/jtreg/compiler/rangechecks/TestRangeCheckEliminationDisabled.java diff --git a/src/hotspot/share/compiler/compilerDefinitions.cpp b/src/hotspot/share/compiler/compilerDefinitions.cpp index 23af57f3910fd..7b0f8b3e2a9d2 100644 --- a/src/hotspot/share/compiler/compilerDefinitions.cpp +++ b/src/hotspot/share/compiler/compilerDefinitions.cpp @@ -512,15 +512,6 @@ bool CompilerConfig::check_args_consistency(bool status) { FLAG_SET_CMDLINE(BackgroundCompilation, false); } -#ifdef COMPILER2 - if (PostLoopMultiversioning && !RangeCheckElimination) { - if
(!FLAG_IS_DEFAULT(PostLoopMultiversioning)) { - warning("PostLoopMultiversioning disabled because RangeCheckElimination is disabled."); - } - FLAG_SET_CMDLINE(PostLoopMultiversioning, false); - } -#endif // COMPILER2 - if (CompilerConfig::is_interpreter_only()) { if (UseCompiler) { if (!FLAG_IS_DEFAULT(UseCompiler)) { diff --git a/src/hotspot/share/opto/c2_globals.hpp b/src/hotspot/share/opto/c2_globals.hpp index 10b8ac2028aef..50dd3b300f4cd 100644 --- a/src/hotspot/share/opto/c2_globals.hpp +++ b/src/hotspot/share/opto/c2_globals.hpp @@ -182,9 +182,6 @@ "Map number of unrolls for main loop via " \ "Superword Level Parallelism analysis") \ \ - product(bool, PostLoopMultiversioning, false, EXPERIMENTAL, \ - "Multi versioned post loops to eliminate range checks") \ - \ notproduct(bool, TraceSuperWordLoopUnrollAnalysis, false, \ "Trace what Superword Level Parallelism analysis applies") \ \ diff --git a/src/hotspot/share/opto/loopTransform.cpp b/src/hotspot/share/opto/loopTransform.cpp index 7c3f5841f3f51..6e454a5cede96 100644 --- a/src/hotspot/share/opto/loopTransform.cpp +++ b/src/hotspot/share/opto/loopTransform.cpp @@ -1888,55 +1888,6 @@ void PhaseIdealLoop::insert_vector_post_loop(IdealLoopTree *loop, Node_List &old loop->record_for_igvn(); } - -//-------------------------insert_scalar_rced_post_loop------------------------ -// Insert a copy of the rce'd main loop as a post loop, -// We have not unrolled the main loop, so this is the right time to inject this. -// Later we will examine the partner of this post loop pair which still has range checks -// to see inject code which tests at runtime if the range checks are applicable. 
-void PhaseIdealLoop::insert_scalar_rced_post_loop(IdealLoopTree *loop, Node_List &old_new) { - if (!loop->_head->is_CountedLoop()) return; - - CountedLoopNode *cl = loop->_head->as_CountedLoop(); - - // only process RCE'd main loops - if (!cl->is_main_loop() || loop->range_checks_present()) return; - -#ifndef PRODUCT - if (TraceLoopOpts) { - tty->print("PostScalarRce "); - loop->dump_head(); - } -#endif - C->set_major_progress(); - - // Find common pieces of the loop being guarded with pre & post loops - CountedLoopNode *main_head = loop->_head->as_CountedLoop(); - CountedLoopEndNode *main_end = main_head->loopexit(); - // diagnostic to show loop end is not properly formed - assert(main_end->outcnt() == 2, "1 true, 1 false path only"); - - Node *incr = main_end->incr(); - Node *limit = main_end->limit(); - - // In this case we throw away the result as we are not using it to connect anything else. - CountedLoopNode *post_head = nullptr; - insert_post_loop(loop, old_new, main_head, main_end, incr, limit, post_head); - copy_assertion_predicates_to_post_loop(main_head->skip_strip_mined(), post_head, incr, main_head->stride()); - - // It's difficult to be precise about the trip-counts - // for post loops. They are usually very short, - // so guess that unit vector trips is a reasonable value. - post_head->set_profile_trip_cnt(4.0); - post_head->set_is_rce_post_loop(); - - // Now force out all loop-invariant dominating tests. The optimizer - // finds some, but we _know_ they are all useless. - peeled_dom_test_elim(loop, old_new); - loop->record_for_igvn(); -} - - //------------------------------insert_post_loop------------------------------- // Insert post loops. Add a post loop to the given loop passed. 
Node *PhaseIdealLoop::insert_post_loop(IdealLoopTree* loop, Node_List& old_new, @@ -3198,143 +3149,6 @@ bool IdealLoopTree::compute_has_range_checks() const { return false; } -//-------------------------multi_version_post_loops---------------------------- -// Check the range checks that remain, if simple, use the bounds to guard -// which version to a post loop we execute, one with range checks or one without -bool PhaseIdealLoop::multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop) { - bool multi_version_succeeded = false; - assert(RangeCheckElimination, ""); - CountedLoopNode *legacy_cl = legacy_loop->_head->as_CountedLoop(); - assert(legacy_cl->is_post_loop(), ""); - - // Check for existence of range checks using the unique instance to make a guard with - Unique_Node_List worklist; - for (uint i = 0; i < legacy_loop->_body.size(); i++) { - Node *iff = legacy_loop->_body[i]; - int iff_opc = iff->Opcode(); - if (iff_opc == Op_If || iff_opc == Op_RangeCheck) { - worklist.push(iff); - } - } - - // Find RCE'd post loop so that we can stage its guard. 
- if (legacy_cl->is_canonical_loop_entry() == nullptr) { - return multi_version_succeeded; - } - Node* ctrl = legacy_cl->in(LoopNode::EntryControl); - Node* iffm = ctrl->in(0); - - // Now we test that both the post loops are connected - Node* post_loop_region = iffm->in(0); - if (post_loop_region == nullptr) return multi_version_succeeded; - if (!post_loop_region->is_Region()) return multi_version_succeeded; - Node* covering_region = post_loop_region->in(RegionNode::Control+1); - if (covering_region == nullptr) return multi_version_succeeded; - if (!covering_region->is_Region()) return multi_version_succeeded; - Node* p_f = covering_region->in(RegionNode::Control); - if (p_f == nullptr) return multi_version_succeeded; - if (!p_f->is_IfFalse()) return multi_version_succeeded; - if (!p_f->in(0)->is_CountedLoopEnd()) return multi_version_succeeded; - CountedLoopEndNode* rce_loop_end = p_f->in(0)->as_CountedLoopEnd(); - if (rce_loop_end == nullptr) return multi_version_succeeded; - CountedLoopNode* rce_cl = rce_loop_end->loopnode(); - if (rce_cl == nullptr || !rce_cl->is_post_loop()) return multi_version_succeeded; - CountedLoopNode *known_rce_cl = rce_loop->_head->as_CountedLoop(); - if (rce_cl != known_rce_cl) return multi_version_succeeded; - - // Then we fetch the cover entry test - ctrl = rce_cl->in(LoopNode::EntryControl); - if (!ctrl->is_IfTrue() && !ctrl->is_IfFalse()) return multi_version_succeeded; - -#ifndef PRODUCT - if (TraceLoopOpts) { - tty->print("PostMultiVersion\n"); - rce_loop->dump_head(); - legacy_loop->dump_head(); - } -#endif - - // Now fetch the limit we want to compare against - Node *limit = rce_cl->limit(); - bool first_time = true; - - // If we got this far, we identified the post loop which has been RCE'd and - // we have a work list. 
Now we will try to transform the if guard to cause - // the loop pair to be multi version executed with the determination left to runtime - // or the optimizer if full information is known about the given arrays at compile time. - Node *last_min = nullptr; - multi_version_succeeded = true; - while (worklist.size()) { - Node* rc_iffm = worklist.pop(); - if (rc_iffm->is_If()) { - Node *rc_bolzm = rc_iffm->in(1); - if (rc_bolzm->is_Bool()) { - Node *rc_cmpzm = rc_bolzm->in(1); - if (rc_cmpzm->is_Cmp()) { - Node *rc_left = rc_cmpzm->in(2); - if (rc_left->Opcode() != Op_LoadRange) { - multi_version_succeeded = false; - break; - } - if (first_time) { - last_min = rc_left; - first_time = false; - } else { - Node *cur_min = new MinINode(last_min, rc_left); - last_min = cur_min; - _igvn.register_new_node_with_optimizer(last_min); - } - } - } - } - } - - // All we have to do is update the limit of the rce loop - // with the min of our expression and the current limit. - // We will use this expression to replace the current limit. 
- if (last_min && multi_version_succeeded) { - Node *cur_min = new MinINode(last_min, limit); - _igvn.register_new_node_with_optimizer(cur_min); - Node *cmp_node = rce_loop_end->cmp_node(); - _igvn.replace_input_of(cmp_node, 2, cur_min); - set_ctrl(cur_min, ctrl); - set_loop(cur_min, rce_loop->_parent); - - legacy_cl->mark_is_multiversioned(); - rce_cl->mark_is_multiversioned(); - multi_version_succeeded = true; - - C->set_major_progress(); - } - - return multi_version_succeeded; -} - -//-------------------------poison_rce_post_loop-------------------------------- -// Causes the rce'd post loop to be optimized away if multiversioning fails -void PhaseIdealLoop::poison_rce_post_loop(IdealLoopTree *rce_loop) { - CountedLoopNode *rce_cl = rce_loop->_head->as_CountedLoop(); - Node* ctrl = rce_cl->in(LoopNode::EntryControl); - if (ctrl->is_IfTrue() || ctrl->is_IfFalse()) { - Node* iffm = ctrl->in(0); - if (iffm->is_If()) { - Node* cur_bool = iffm->in(1); - if (cur_bool->is_Bool()) { - Node* cur_cmp = cur_bool->in(1); - if (cur_cmp->is_Cmp()) { - BoolTest::mask new_test = BoolTest::gt; - BoolNode *new_bool = new BoolNode(cur_cmp, new_test); - _igvn.replace_node(cur_bool, new_bool); - _igvn._worklist.push(new_bool); - Node* left_op = cur_cmp->in(1); - _igvn.replace_input_of(cur_cmp, 2, left_op); - C->set_major_progress(); - } - } - } - } -} - //------------------------------DCE_loop_body---------------------------------- // Remove simplistic dead code from loop body void IdealLoopTree::DCE_loop_body() { @@ -3864,14 +3678,6 @@ bool IdealLoopTree::iteration_split_impl(PhaseIdealLoop *phase, Node_List &old_n phase->do_range_check(this, old_new); } - if (should_unroll && !should_peel && PostLoopMultiversioning && - Matcher::has_predicated_vectors()) { - // Try to setup multiversioning on main loops before they are unrolled - if (cl->is_main_loop() && (cl->unrolled_count() == 1)) { - phase->insert_scalar_rced_post_loop(this, old_new); - } - } - // Double loop body for 
unrolling. Adjust the minimum-trip test (will do // twice as many iterations as before) and the main body limit (only do // an even number of trips). If we are peeling, we might enable some RCE diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp index 9148466f8ac2d..7551b0e246621 100644 --- a/src/hotspot/share/opto/loopnode.cpp +++ b/src/hotspot/share/opto/loopnode.cpp @@ -4013,7 +4013,6 @@ void IdealLoopTree::dump_head() { if (cl->is_post_loop()) tty->print(" post"); if (cl->is_vectorized_loop()) tty->print(" vector"); if (range_checks_present()) tty->print(" rc "); - if (cl->is_multiversioned()) tty->print(" multi "); } if (_has_call) tty->print(" has_call"); if (_has_sfpt) tty->print(" has_sfpt"); @@ -4653,29 +4652,7 @@ void PhaseIdealLoop::build_and_optimize() { IdealLoopTree* lpt = iter.current(); if (lpt->is_counted()) { CountedLoopNode *cl = lpt->_head->as_CountedLoop(); - - if (cl->is_rce_post_loop() && !cl->is_vectorized_loop()) { - assert(PostLoopMultiversioning, "multiversioning must be enabled"); - // Check that the rce'd post loop is encountered first, multiversion after all - // major main loop optimization are concluded - if (!C->major_progress()) { - IdealLoopTree *lpt_next = lpt->_next; - if (lpt_next && lpt_next->is_counted()) { - CountedLoopNode *cl = lpt_next->_head->as_CountedLoop(); - if (cl->is_post_loop() && lpt_next->range_checks_present()) { - if (!cl->is_multiversioned()) { - if (multi_version_post_loops(lpt, lpt_next) == false) { - // Cause the rce loop to be optimized away if we fail - cl->mark_is_multiversioned(); - cl->set_slp_max_unroll(0); - poison_rce_post_loop(lpt); - } - } - } - } - sw.transform_loop(lpt, true); - } - } else if (cl->is_main_loop()) { + if (cl->is_main_loop()) { if (!sw.transform_loop(lpt, true)) { // Instigate more unrolling for optimization when vectorization fails. 
if (cl->has_passed_slp()) { diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index 70c403d30a5f8..57cb93ff1f9e0 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -72,16 +72,13 @@ class LoopNode : public RegionNode { DoUnrollOnly = 1<<9, VectorizedLoop = 1<<10, HasAtomicPostLoop = 1<<11, - IsMultiversioned = 1<<12, - StripMined = 1<<13, - SubwordLoop = 1<<14, - ProfileTripFailed = 1<<15, - LoopNestInnerLoop = 1<<16, - LoopNestLongOuterLoop = 1<<17}; + StripMined = 1<<12, + SubwordLoop = 1<<13, + ProfileTripFailed = 1<<14, + LoopNestInnerLoop = 1<<15, + LoopNestLongOuterLoop = 1<<16 }; char _unswitch_count; enum { _unswitch_max=3 }; - char _postloop_flags; - enum { RCEPostLoop = 1 }; // Expected trip count from profile data float _profile_trip_cnt; @@ -93,7 +90,6 @@ class LoopNode : public RegionNode { bool is_inner_loop() const { return _loop_flags & InnerLoop; } void set_inner_loop() { _loop_flags |= InnerLoop; } - bool is_multiversioned() const { return _loop_flags & IsMultiversioned; } bool is_vectorized_loop() const { return _loop_flags & VectorizedLoop; } bool is_partial_peel_loop() const { return _loop_flags & PartialPeelLoop; } void set_partial_peel_loop() { _loop_flags |= PartialPeelLoop; } @@ -110,7 +106,6 @@ class LoopNode : public RegionNode { void mark_do_unroll_only() { _loop_flags |= DoUnrollOnly; } void mark_loop_vectorized() { _loop_flags |= VectorizedLoop; } void mark_has_atomic_post_loop() { _loop_flags |= HasAtomicPostLoop; } - void mark_is_multiversioned() { _loop_flags |= IsMultiversioned; } void mark_strip_mined() { _loop_flags |= StripMined; } void clear_strip_mined() { _loop_flags &= ~StripMined; } void mark_profile_trip_failed() { _loop_flags |= ProfileTripFailed; } @@ -121,9 +116,6 @@ class LoopNode : public RegionNode { int unswitch_max() { return _unswitch_max; } int unswitch_count() { return _unswitch_count; } - int is_rce_post_loop() const { return 
_postloop_flags & RCEPostLoop; } - void set_is_rce_post_loop() { _postloop_flags |= RCEPostLoop; } - void set_unswitch_count(int val) { assert (val <= unswitch_max(), "too many unswitches"); _unswitch_count = val; @@ -134,7 +126,7 @@ class LoopNode : public RegionNode { LoopNode(Node *entry, Node *backedge) : RegionNode(3), _loop_flags(0), _unswitch_count(0), - _postloop_flags(0), _profile_trip_cnt(COUNT_UNKNOWN) { + _profile_trip_cnt(COUNT_UNKNOWN) { init_class_id(Class_Loop); init_req(EntryControl, entry); init_req(LoopBackControl, backedge); @@ -322,8 +314,6 @@ class CountedLoopNode : public BaseCountedLoopNode { int node_count_before_unroll() { return _node_count_before_unroll; } void set_slp_max_unroll(int unroll_factor) { _slp_maximum_unroll_factor = unroll_factor; } int slp_max_unroll() const { return _slp_maximum_unroll_factor; } - void set_slp_pack_count(int pack_count) { _slp_vector_pack_count = pack_count; } - int slp_pack_count() const { return _slp_vector_pack_count; } virtual LoopNode* skip_strip_mined(int expect_skeleton = 1); OuterStripMinedLoopNode* outer_loop() const; @@ -1305,9 +1295,6 @@ class PhaseIdealLoop : public PhaseTransform { CountedLoopNode* main_head, CountedLoopEndNode* main_end, Node*& incr, Node* limit, CountedLoopNode*& post_head); - // Add an RCE'd post loop which we will multi-version adapt for run time test path usage - void insert_scalar_rced_post_loop( IdealLoopTree *loop, Node_List &old_new ); - // Add a vector post loop between a vector main loop and the current post loop void insert_vector_post_loop(IdealLoopTree *loop, Node_List &old_new); // If Node n lives in the back_ctrl block, we clone a private version of n @@ -1402,13 +1389,6 @@ class PhaseIdealLoop : public PhaseTransform { // Eliminate range-checks and other trip-counter vs loop-invariant tests. 
void do_range_check(IdealLoopTree *loop, Node_List &old_new); - // Process post loops which have range checks and try to build a multi-version - // guard to safely determine if we can execute the post loop which was RCE'd. - bool multi_version_post_loops(IdealLoopTree *rce_loop, IdealLoopTree *legacy_loop); - - // Cause the rce'd post loop to optimized away, this happens if we cannot complete multiverioning - void poison_rce_post_loop(IdealLoopTree *rce_loop); - // Create a slow version of the loop by cloning the loop // and inserting an if to select fast-slow versions. // Return the inserted if. diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index f665089d0bf75..548e7bc332052 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -54,7 +54,6 @@ SuperWord::SuperWord(PhaseIdealLoop* phase) : _packset(arena(), 8, 0, nullptr), // packs for the current block _bb_idx(arena(), (int)(1.10 * phase->C->unique()), 0, 0), // node idx to index in bb _block(arena(), 8, 0, nullptr), // nodes in current block - _post_block(arena(), 8, 0, nullptr), // nodes common to current block which are marked as post loop vectorizable _data_entry(arena(), 8, 0, nullptr), // nodes with all inputs from outside _mem_slice_head(arena(), 8, 0, nullptr), // memory slice heads _mem_slice_tail(arena(), 8, 0, nullptr), // memory slice tails @@ -116,11 +115,6 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) { mark_reductions(); } - if (cl->is_rce_post_loop() && is_marked_reduction_loop()) { - // Post loop vectorization doesn't support reductions - return false; - } - // skip any loop that has not been assigned max unroll by analysis if (do_optimization) { if (SuperWordLoopUnrollAnalysis && cl->slp_max_unroll() == 0) { @@ -176,24 +170,6 @@ bool SuperWord::transform_loop(IdealLoopTree* lpt, bool do_optimization) { if (do_optimization) { assert(_packset.length() == 0, "packset must be empty"); 
success = SLP_extract(); - if (PostLoopMultiversioning) { - if (cl->is_vectorized_loop() && cl->is_main_loop() && !is_marked_reduction_loop()) { - IdealLoopTree *lpt_next = cl->is_strip_mined() ? lpt->_parent->_next : lpt->_next; - CountedLoopNode *cl_next = lpt_next->_head->as_CountedLoop(); - // Main loop SLP works well for manually unrolled loops. But post loop - // vectorization doesn't work for these. To bail out the optimization - // earlier, we have range check and loop stride conditions below. - if (cl_next->is_post_loop() && !lpt_next->range_checks_present() && - cl_next->stride_is_con() && abs(cl_next->stride_con()) == 1) { - if (!cl_next->is_vectorized_loop()) { - // Propagate some main loop attributes to its corresponding scalar - // rce'd post loop for vectorization with vector masks - cl_next->set_slp_max_unroll(cl->slp_max_unroll()); - cl_next->set_slp_pack_count(cl->slp_pack_count()); - } - } - } - } } return success; } @@ -206,9 +182,6 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) { Node_Stack nstack((int)ignored_size); CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); Node *cl_exit = cl->loopexit_or_null(); - int rpo_idx = _post_block.length(); - - assert(rpo_idx == 0, "post loop block is empty"); // First clear the entries for (uint i = 0; i < lpt()->_body.size(); i++) { @@ -313,27 +286,6 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) { } if (is_slp) { - // In the main loop, SLP works well if parts of the operations in the loop body - // are not vectorizable and those non-vectorizable parts will be unrolled only. - // But in post loops with vector masks, we create singleton packs directly from - // scalars so all operations should be vectorized together. This compares the - // number of packs in the post loop with the main loop and bail out if the post - // loop potentially has more packs. 
- if (cl->is_rce_post_loop()) { - for (uint i = 0; i < lpt()->_body.size(); i++) { - if (ignored_loop_nodes[i] == -1) { - _post_block.at_put_grow(rpo_idx++, lpt()->_body.at(i)); - } - } - if (_post_block.length() > cl->slp_pack_count()) { - // Clear local_loop_unroll_factor and bail out directly from here - local_loop_unroll_factor = 0; - cl->mark_was_slp(); - cl->set_slp_max_unroll(0); - return; - } - } - // Now we try to find the maximum supported consistent vector which the machine // description can use bool flag_small_bt = false; @@ -404,7 +356,7 @@ void SuperWord::unrolling_analysis(int &local_loop_unroll_factor) { cl->mark_passed_slp(); } cl->mark_was_slp(); - if (cl->is_main_loop() || cl->is_rce_post_loop()) { + if (cl->is_main_loop()) { cl->set_slp_max_unroll(local_loop_unroll_factor); } } @@ -590,79 +542,42 @@ bool SuperWord::SLP_extract() { } } #endif + + CountedLoopNode* cl = lpt()->_head->as_CountedLoop(); + assert(cl->is_main_loop(), "SLP should only work on main loops"); + // Ready the block if (!construct_bb()) { return false; // Exit if no interesting nodes or complex graph. 
} - // build _dg, _disjoint_ptrs + // build _dg, _disjoint_ptrs dependence_graph(); // compute function depth(Node*) compute_max_depth(); - CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); - if (cl->is_main_loop()) { - compute_vector_element_type(); - - // Attempt vectorization + // Compute vector element types + compute_vector_element_type(); - find_adjacent_refs(); - - if (align_to_ref() == nullptr) { - return false; // Did not find memory reference to align vectors - } + // Attempt vectorization + find_adjacent_refs(); - extend_packlist(); - - combine_packs(); + if (align_to_ref() == nullptr) { + return false; // Did not find memory reference to align vectors + } - construct_my_pack_map(); + extend_packlist(); - filter_packs(); + combine_packs(); - DEBUG_ONLY(verify_packs();) + construct_my_pack_map(); - schedule(); + filter_packs(); - // Record eventual count of vector packs for checks in post loop vectorization - if (PostLoopMultiversioning) { - cl->set_slp_pack_count(_packset.length()); - } - } else { - assert(cl->is_rce_post_loop(), "Must be an rce'd post loop"); - int saved_mapped_unroll_factor = cl->slp_max_unroll(); - if (saved_mapped_unroll_factor) { - int vector_mapped_unroll_factor = saved_mapped_unroll_factor; - - // now reset the slp_unroll_factor so that we can check the analysis mapped - // what the vector loop was mapped to - cl->set_slp_max_unroll(0); - - // do the analysis on the post loop - unrolling_analysis(vector_mapped_unroll_factor); - - // if our analyzed loop is a canonical fit, start processing it - if (vector_mapped_unroll_factor == saved_mapped_unroll_factor) { - // now add the vector nodes to packsets - for (int i = 0; i < _post_block.length(); i++) { - Node* n = _post_block.at(i); - Node_List* singleton = new Node_List(); - singleton->push(n); - _packset.append(singleton); - set_my_pack(n, singleton); - } + DEBUG_ONLY(verify_packs();) - // map base types for vector usage - compute_vector_element_type(); - } else { - return 
false; - } - } else { - // for some reason we could not map the slp analysis state of the vectorized loop - return false; - } - } + schedule(); return output(); } @@ -1143,6 +1058,8 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) { // A.out()->DependNode.in(1) and DependNode.out()->B.prec(x) void SuperWord::dependence_graph() { CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); + assert(cl->is_main_loop(), "SLP should only work on main loops"); + // First, assign a dependence node to each memory node for (int i = 0; i < _block.length(); i++ ) { Node *n = _block.at(i); @@ -1157,9 +1074,7 @@ void SuperWord::dependence_graph() { Node* n_tail = _mem_slice_tail.at(i); // Get slice in predecessor order (last is first) - if (cl->is_main_loop()) { - mem_slice_preds(n_tail, n, _nlist); - } + mem_slice_preds(n_tail, n, _nlist); #ifndef PRODUCT if(TraceSuperWord && Verbose) { @@ -2591,6 +2506,7 @@ void SuperWord::print_loop(bool whole) { // Convert packs into vector node operations bool SuperWord::output() { CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); + assert(cl->is_main_loop(), "SLP should only work on main loops"); Compile* C = _phase->C; if (_packset.length() == 0) { return false; @@ -2603,16 +2519,13 @@ bool SuperWord::output() { } #endif - if (cl->is_main_loop()) { - // MUST ENSURE main loop's initial value is properly aligned: - // (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0 - - align_initial_loop_index(align_to_ref()); + // Ensure main loop's initial value is properly aligned + // (iv_initial_value + min_iv_offset) % vector_width_in_bytes() == 0 + align_initial_loop_index(align_to_ref()); - // Insert extract (unpack) operations for scalar uses - for (int i = 0; i < _packset.length(); i++) { - insert_extracts(_packset.at(i)); - } + // Insert extract (unpack) operations for scalar uses + for (int i = 0; i < _packset.length(); i++) { + insert_extracts(_packset.at(i)); } uint max_vlen_in_bytes = 0; @@ -2629,16 +2542,6 @@ bool 
SuperWord::output() { return false; } - Node* vmask = nullptr; - if (cl->is_rce_post_loop() && do_reserve_copy()) { - // Create a vector mask node for post loop, bail out if not created - vmask = create_post_loop_vmask(); - if (vmask == nullptr) { - // create_post_loop_vmask checks many conditions, any of them could fail - return false; // and reverse to backup IG - } - } - for (int i = 0; i < _block.length(); i++) { Node* n = _block.at(i); Node_List* p = my_pack(n); @@ -2650,10 +2553,6 @@ bool SuperWord::output() { uint vlen = p->size(); uint vlen_in_bytes = 0; Node* vn = nullptr; - if (cl->is_rce_post_loop()) { - // override vlen with the main loops vector length - vlen = cl->slp_max_unroll(); - } NOT_PRODUCT(if(is_trace_cmov()) {tty->print_cr("SWPointer::output: %d executed first, %d executed last in pack", first->_idx, n->_idx); print_pack(p);}) int opc = n->Opcode(); if (n->is_Load()) { @@ -2675,13 +2574,7 @@ bool SuperWord::output() { } Node* adr = first->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); - if (cl->is_rce_post_loop()) { - assert(vmask != nullptr, "vector mask should be generated"); - const TypeVect* vt = TypeVect::make(velt_basic_type(n), vlen); - vn = new LoadVectorMaskedNode(ctl, mem, adr, atyp, vt, vmask); - } else { - vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p)); - } + vn = LoadVectorNode::make(opc, ctl, mem, adr, atyp, vlen, velt_basic_type(n), control_dependency(p)); vlen_in_bytes = vn->as_LoadVector()->memory_size(); } else if (n->is_Store()) { // Promote value to be stored to vector @@ -2699,13 +2592,7 @@ bool SuperWord::output() { Node* mem = first->in(MemNode::Memory); Node* adr = first->in(MemNode::Address); const TypePtr* atyp = n->adr_type(); - if (cl->is_rce_post_loop()) { - assert(vmask != nullptr, "vector mask should be generated"); - const TypeVect* vt = TypeVect::make(velt_basic_type(n), vlen); - vn = new StoreVectorMaskedNode(ctl, mem, adr, val, atyp, vmask); - 
} else { - vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen); - } + vn = StoreVectorNode::make(opc, ctl, mem, adr, atyp, val, vlen); vlen_in_bytes = vn->as_StoreVector()->memory_size(); } else if (VectorNode::is_scalar_rotate(n)) { Node* in1 = first->in(1); @@ -2961,20 +2848,12 @@ bool SuperWord::output() { if (TraceSuperWordLoopUnrollAnalysis) { tty->print_cr("vector loop(unroll=%d, len=%d)\n", max_vlen, max_vlen_in_bytes*BitsPerByte); } - // For atomic unrolled loops which are vector mapped, instigate more unrolling cl->set_notpassed_slp(); - if (cl->is_main_loop()) { - // if vector resources are limited, do not allow additional unrolling, also - // do not unroll more on pure vector loops which were not reduced so that we can - // program the post loop to single iteration execution. - if (Matcher::float_pressure_limit() > 8) { - C->set_major_progress(); - cl->mark_do_unroll_only(); - } - } - if (cl->is_rce_post_loop() && do_reserve_copy()) { - cl->mark_is_multiversioned(); + // if vector resources are limited, do not allow additional unrolling + if (Matcher::float_pressure_limit() > 8) { + C->set_major_progress(); + cl->mark_do_unroll_only(); } } } @@ -2988,107 +2867,6 @@ bool SuperWord::output() { return true; } -//-------------------------create_post_loop_vmask------------------------- -// Check the post loop vectorizability and create a vector mask if yes. -// Return null to bail out if post loop is not vectorizable. -Node* SuperWord::create_post_loop_vmask() { - CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); - assert(cl->is_rce_post_loop(), "Must be an rce post loop"); - assert(!is_marked_reduction_loop(), "no vector reduction in post loop"); - assert(abs(cl->stride_con()) == 1, "post loop stride can only be +/-1"); - - // Collect vector element types of all post loop packs. Also collect - // superword pointers of each memory access operation if the address - // expression is supported. 
(Note that vectorizable post loop should - // only have positive scale in counting-up loop and negative scale in - // counting-down loop.) Collected SWPointer(s) are also used for data - // dependence check next. - VectorElementSizeStats stats(_arena); - GrowableArray swptrs(_arena, _packset.length(), 0, nullptr); - for (int i = 0; i < _packset.length(); i++) { - Node_List* p = _packset.at(i); - assert(p->size() == 1, "all post loop packs should be singleton"); - Node* n = p->at(0); - BasicType bt = velt_basic_type(n); - if (!is_java_primitive(bt)) { - return nullptr; - } - if (n->is_Mem()) { - SWPointer* mem_p = new (_arena) SWPointer(n->as_Mem(), this, nullptr, false); - // For each memory access, we check if the scale (in bytes) in its - // address expression is equal to the data size times loop stride. - // With this, Only positive scales exist in counting-up loops and - // negative scales exist in counting-down loops. - if (mem_p->scale_in_bytes() != type2aelembytes(bt) * cl->stride_con()) { - return nullptr; - } - swptrs.append(mem_p); - } - stats.record_size(type2aelembytes(bt)); - } - - // Find the vector data type for generating vector masks. Currently we - // don't support post loops with mixed vector data sizes - int unique_size = stats.unique_size(); - BasicType vmask_bt; - switch (unique_size) { - case 1: vmask_bt = T_BYTE; break; - case 2: vmask_bt = T_SHORT; break; - case 4: vmask_bt = T_INT; break; - case 8: vmask_bt = T_LONG; break; - default: return nullptr; - } - - // Currently we can't remove this MaxVectorSize constraint. Without it, - // it's not guaranteed that the RCE'd post loop runs at most "vlen - 1" - // iterations, because the vector drain loop may not be cloned from the - // vectorized main loop. We should re-engineer PostLoopMultiversioning - // to fix this problem. 
- int vlen = cl->slp_max_unroll(); - if (unique_size * vlen != MaxVectorSize) { - return nullptr; - } - - // Bail out if target doesn't support mask generator or masked load/store - if (!Matcher::match_rule_supported_vector(Op_LoadVectorMasked, vlen, vmask_bt) || - !Matcher::match_rule_supported_vector(Op_StoreVectorMasked, vlen, vmask_bt) || - !Matcher::match_rule_supported_vector(Op_VectorMaskGen, vlen, vmask_bt)) { - return nullptr; - } - - // Bail out if potential data dependence exists between memory accesses - if (SWPointer::has_potential_dependence(swptrs)) { - return nullptr; - } - - // Create vector mask with the post loop trip count. Note there's another - // vector drain loop which is cloned from main loop before super-unrolling - // so the scalar post loop runs at most vlen-1 trips. Hence, this version - // only runs at most 1 iteration after vector mask transformation. - Node* trip_cnt; - Node* new_incr; - if (cl->stride_con() > 0) { - trip_cnt = new SubINode(cl->limit(), cl->init_trip()); - new_incr = new AddINode(cl->phi(), trip_cnt); - } else { - trip_cnt = new SubINode(cl->init_trip(), cl->limit()); - new_incr = new SubINode(cl->phi(), trip_cnt); - } - _igvn.register_new_node_with_optimizer(trip_cnt); - _igvn.register_new_node_with_optimizer(new_incr); - _igvn.replace_node(cl->incr(), new_incr); - Node* length = new ConvI2LNode(trip_cnt); - _igvn.register_new_node_with_optimizer(length); - Node* vmask = VectorMaskGenNode::make(length, vmask_bt); - _igvn.register_new_node_with_optimizer(vmask); - - // Remove exit test to transform 1-iteration loop to straight-line code. - // This results in redundant cmp+branch instructions been eliminated. 
- Node *cl_exit = cl->loopexit(); - _igvn.replace_input_of(cl_exit, 1, _igvn.intcon(0)); - return vmask; -} - //------------------------------vector_opd--------------------------- // Create a vector operand for the nodes in pack p for operand: in(opd_idx) Node* SuperWord::vector_opd(Node_List* p, int opd_idx) { @@ -3098,19 +2876,11 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) { CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); bool have_same_inputs = same_inputs(p, opd_idx); - if (cl->is_rce_post_loop()) { - // override vlen with the main loops vector length - assert(p->size() == 1, "Packs in post loop should have only one node"); - vlen = cl->slp_max_unroll(); - } - // Insert index population operation to create a vector of increasing // indices starting from the iv value. In some special unrolled loops // (see JDK-8286125), we need scalar replications of the iv value if - // all inputs are the same iv, so we do a same inputs check here. But - // in post loops, "have_same_inputs" is always true because all packs - // are singleton. That's why a pack size check is also required. - if (opd == iv() && (!have_same_inputs || p->size() == 1)) { + // all inputs are the same iv, so we do a same inputs check here. + if (opd == iv() && !have_same_inputs) { BasicType p0_bt = velt_basic_type(p0); BasicType iv_bt = is_subword_type(p0_bt) ? p0_bt : T_INT; assert(VectorNode::is_populate_index_supported(iv_bt), "Should support"); @@ -4026,7 +3796,6 @@ void SuperWord::init() { _packset.clear(); _disjoint_ptrs.clear(); _block.clear(); - _post_block.clear(); _data_entry.clear(); _mem_slice_head.clear(); _mem_slice_tail.clear(); @@ -4447,34 +4216,6 @@ void SWPointer::maybe_add_to_invar(Node* new_invar, bool negate) { _invar = register_if_new(add); } -//-----------------has_potential_dependence----------------- -// Check potential data dependence among all memory accesses. 
-// We require every two accesses (with at least one store) of -// the same element type has the same address expression. -bool SWPointer::has_potential_dependence(GrowableArray swptrs) { - for (int i1 = 0; i1 < swptrs.length(); i1++) { - SWPointer* p1 = swptrs.at(i1); - MemNode* n1 = p1->mem(); - BasicType bt1 = n1->memory_type(); - - // Iterate over remaining SWPointers - for (int i2 = i1 + 1; i2 < swptrs.length(); i2++) { - SWPointer* p2 = swptrs.at(i2); - MemNode* n2 = p2->mem(); - BasicType bt2 = n2->memory_type(); - - // Data dependence exists between load-store, store-load - // or store-store with the same element type or subword - // size (subword load/store may have inaccurate type) - if ((n1->is_Store() || n2->is_Store()) && - same_type_or_subword_size(bt1, bt2) && !p1->equal(*p2)) { - return true; - } - } - } - return false; -} - //----------------------------print------------------------ void SWPointer::print() { #ifndef PRODUCT diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 2fcc169f8af2b..01a99589ec841 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -285,7 +285,6 @@ class SuperWord : public ResourceObj { GrowableArray _bb_idx; // Map from Node _idx to index within block GrowableArray _block; // Nodes in current block - GrowableArray _post_block; // Nodes in post loop block GrowableArray _data_entry; // Nodes with all inputs from outside GrowableArray _mem_slice_head; // Memory slice head nodes GrowableArray _mem_slice_tail; // Memory slice tail nodes @@ -579,8 +578,6 @@ class SuperWord : public ResourceObj { // Convert packs into vector node operations bool output(); - // Create vector mask for post loop vectorization - Node* create_post_loop_vmask(); // Create a vector operand for the nodes in pack p for operand: in(opd_idx) Node* vector_opd(Node_List* p, int opd_idx); // Can code be generated for pack p? 
@@ -725,8 +722,6 @@ class SWPointer : public ArenaObj { static bool equal(int cmp) { return cmp == Equal; } static bool comparable(int cmp) { return cmp < NotComparable; } - static bool has_potential_dependence(GrowableArray swptrs); - void print(); #ifndef PRODUCT diff --git a/test/hotspot/jtreg/compiler/rangechecks/TestRangeCheckEliminationDisabled.java b/test/hotspot/jtreg/compiler/rangechecks/TestRangeCheckEliminationDisabled.java deleted file mode 100644 index c55a9add80f66..0000000000000 --- a/test/hotspot/jtreg/compiler/rangechecks/TestRangeCheckEliminationDisabled.java +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This code is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License version 2 only, as - * published by the Free Software Foundation. - * - * This code is distributed in the hope that it will be useful, but WITHOUT - * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or - * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License - * version 2 for more details (a copy is included in the LICENSE file that - * accompanied this code). - * - * You should have received a copy of the GNU General Public License version - * 2 along with this work; if not, write to the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. - * - * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA - * or visit www.oracle.com if you need additional information or have any - * questions. - */ - -/** - * @test TestRangeCheckEliminationDisabled - * @bug 8154763 - * @summary Tests PostLoopMultiversioning with RangeCheckElimination disabled. 
- * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions - * -XX:+UnlockExperimentalVMOptions -XX:+PostLoopMultiversioning -XX:-RangeCheckElimination - * compiler.rangechecks.TestRangeCheckEliminationDisabled - */ - -package compiler.rangechecks; - -public class TestRangeCheckEliminationDisabled { - - public static void main(String[] args) { - System.out.println("Passed"); - } -} - diff --git a/test/hotspot/jtreg/compiler/vectorization/runner/VectorizationTestRunner.java b/test/hotspot/jtreg/compiler/vectorization/runner/VectorizationTestRunner.java index 845733aeb0d63..2a4c0839b514a 100644 --- a/test/hotspot/jtreg/compiler/vectorization/runner/VectorizationTestRunner.java +++ b/test/hotspot/jtreg/compiler/vectorization/runner/VectorizationTestRunner.java @@ -61,9 +61,6 @@ protected void run() { // each test method returning a primitive value or an array of primitive type. // And each test method should not throw any exceptions. Class klass = getClass(); - // Add extra VM options to verify experimental auto-vectorization - WB.setBooleanVMFlag("UnlockExperimentalVMOptions", true); - WB.setBooleanVMFlag("PostLoopMultiversioning", true); for (Method method : klass.getDeclaredMethods()) { try { if (method.isAnnotationPresent(Test.class)) {