From edf26bba9d84dbef40daf2bc4514335a7519333a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 14 Oct 2025 18:09:55 +0200 Subject: [PATCH 01/39] 8340093 --- src/hotspot/share/opto/vtransform.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 7ad7b432e9b43..bfd124a6ed772 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -51,6 +51,10 @@ // - Compute linearization of the VTransformGraph, into an order that respects // all edges in the graph (bailout if cycle detected). // +// - Cost-Model: +// - We use a cost-model as a heuristic to determine if vectorization is profitable. +// Compute the cost of the loop with and without vectorization. +// // - Apply: // - Changes to the C2 IR are only made once the "apply" method is called. // - Align the main loop, by adjusting pre loop limit. From ce4ce1f0fd50cbe0f015132b8270f4d173803121 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 15 Oct 2025 09:08:46 +0200 Subject: [PATCH 02/39] add cost to matcher --- src/hotspot/share/opto/matcher.cpp | 27 +++++++++++++++++++++++++++ src/hotspot/share/opto/matcher.hpp | 5 +++++ 2 files changed, 32 insertions(+) diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index 7d73487cf8840..49cd1a4051561 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2677,6 +2677,33 @@ void Matcher::specialize_generic_vector_operands() { } } +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. +float Matcher::cost_for_scalar(int opcode) { + return 1; +} + +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. 
+float Matcher::cost_for_vector(int opcode, int vlen, BasicType bt) { + return 1; +} + +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. +float Matcher::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) { + if (requires_strict_order) { + // Linear: shuffle and reduce + return 2 * vlen; + } else { + // Recursive: shuffle and reduce + return 2 * exact_log2(vlen); + } +} + uint Matcher::vector_length(const Node* n) { const TypeVect* vt = n->bottom_type()->is_vect(); return vt->length(); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index 0b609b70ab5e9..7ab070bec4ab5 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -333,6 +333,11 @@ class Matcher : public PhaseTransform { static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen); + // Cost-Model for Auto-Vectorization + static float cost_for_scalar(int opcode); + static float cost_for_vector(int opcode, int vlen, BasicType bt); + static float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order); + static const RegMask* predicate_reg_mask(void); // Vector width in bytes From 8ac7d0ae12c16f7e894acbb3dd49774cdd88ead8 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 20 Oct 2025 15:59:50 +0200 Subject: [PATCH 03/39] rm old reduction heuristic --- src/hotspot/share/opto/superword.cpp | 67 +++------------------------- src/hotspot/share/opto/superword.hpp | 2 - 2 files changed, 7 insertions(+), 62 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index c0f005048ec66..11577af656eba 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -42,9 +42,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : ), 
_vpointer_for_main_loop_alignment(nullptr), _aw_for_main_loop_alignment(0), - _do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style - _num_work_vecs(0), // amount of vector work we have - _num_reductions(0) // amount of reduction work we have + _do_vector_loop(phase()->C->do_vector_loop()) // whether to do vectorization/simd style { } @@ -1567,18 +1565,6 @@ void SuperWord::filter_packs_for_implemented() { // Remove packs that are not profitable. void SuperWord::filter_packs_for_profitable() { - // Count the number of reductions vs other vector ops, for the - // reduction profitability heuristic. - for (int i = 0; i < _packset.length(); i++) { - Node_List* pack = _packset.at(i); - Node* n = pack->at(0); - if (is_marked_reduction(n)) { - _num_reductions++; - } else { - _num_work_vecs++; - } - } - // Remove packs that are not profitable auto filter = [&](const Node_List* pack) { return profitable(pack); @@ -1595,31 +1581,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const { if (p0 != nullptr) { int opc = p0->Opcode(); if (is_marked_reduction(p0)) { - const Type *arith_type = p0->bottom_type(); - // This heuristic predicts that 2-element reductions for INT/LONG are not - // profitable. This heuristic was added in JDK-8078563. The argument - // was that reductions are not just a single instruction, but multiple, and - // hence it is not directly clear that they are profitable. If we only have - // two elements per vector, then the performance gains from non-reduction - // vectors are at most going from 2 scalar instructions to 1 vector instruction. - // But a 2-element reduction vector goes from 2 scalar instructions to - // 3 instructions (1 shuffle and two reduction ops). 
- // However, this optimization assumes that these reductions stay in the loop - // which may not be true any more in most cases after the introduction of: - // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop - // Hence, this heuristic has room for improvement. - bool is_two_element_int_or_long_reduction = (size == 2) && - (arith_type->basic_type() == T_INT || - arith_type->basic_type() == T_LONG); - if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) { -#ifndef PRODUCT - if (is_trace_superword_rejections()) { - tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable."); - tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); - } -#endif - return false; - } + const Type* arith_type = p0->bottom_type(); retValue = ReductionNode::implemented(opc, size, arith_type->basic_type()); } else if (VectorNode::is_convert_opcode(opc)) { retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0)); @@ -1772,26 +1734,6 @@ bool SuperWord::profitable(const Node_List* p) const { // The second input has to be the vector we wanted to reduce, // but it was not packed. return false; - } else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) { - // This heuristic predicts that the reduction is not profitable. - // Reduction vectors can be expensive, because they require multiple - // operations to fold all the lanes together. Hence, vectorizing the - // reduction is not profitable on its own. Hence, we need a lot of - // other "work vectors" that deliver performance improvements to - // balance out the performance loss due to reductions. - // This heuristic is a bit simplistic, and assumes that the reduction - // vector stays in the loop. But in some cases, we can move the - // reduction out of the loop, replacing it with a single vector op. 
- // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop - // Hence, this heuristic has room for improvement. -#ifndef PRODUCT - if (is_trace_superword_rejections()) { - tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make"); - tty->print_cr(" reduction profitable."); - tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); - } -#endif - return false; } else if (second_pk->size() != p->size()) { return false; } @@ -1950,6 +1892,9 @@ bool SuperWord::do_vtransform() const { vtransform.optimize(); if (!vtransform.schedule()) { return false; } + + // TODO: use AutoVectorizationOverrideProfitability + // Maybe order it after the general bailout? if (vtransform.has_store_to_load_forwarding_failure()) { return false; } if (AutoVectorizationOverrideProfitability == 0) { @@ -1961,6 +1906,8 @@ bool SuperWord::do_vtransform() const { return false; } + // TODO: check cost + vtransform.apply(); return true; } diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 118e0aa042c79..9654465220b9c 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -549,8 +549,6 @@ class SuperWord : public ResourceObj { private: bool _do_vector_loop; // whether to do vectorization/simd style - int _num_work_vecs; // Number of non memory vector operations - int _num_reductions; // Number of reduction expressions applied // Accessors Arena* arena() { return &_arena; } From 57e69dfea5c8cac2dd6f289a89e59abea6d152dd Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 20 Oct 2025 16:14:42 +0200 Subject: [PATCH 04/39] refactor with is_profitable --- src/hotspot/share/opto/superword.cpp | 25 ++++++++++++++++++++----- src/hotspot/share/opto/vtransform.hpp | 1 + 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 11577af656eba..50e4ebece3072 
100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1893,22 +1893,37 @@ bool SuperWord::do_vtransform() const { if (!vtransform.schedule()) { return false; } - // TODO: use AutoVectorizationOverrideProfitability - // Maybe order it after the general bailout? - if (vtransform.has_store_to_load_forwarding_failure()) { return false; } + if (!vtransform.is_profitable()) { return false; } + + vtransform.apply(); + return true; +} + +bool VTransform::is_profitable() const { + assert(_graph.is_scheduled(), "must already be scheduled"); if (AutoVectorizationOverrideProfitability == 0) { #ifndef PRODUCT - if (is_trace_superword_any()) { + if (_trace._info) { tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0)."); } #endif return false; } + if (AutoVectorizationOverrideProfitability == 2) { +#ifndef PRODUCT + if (_trace._info) { + tty->print_cr("\nForced vectorization, ignoring profitability (AutoVectorizationOverrideProfitability=2)."); + } +#endif + return true; + } + + if (has_store_to_load_forwarding_failure()) { return false; } + // TODO: check cost - vtransform.apply(); return true; } diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index bfd124a6ed772..b0902cab5ca9f 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -256,6 +256,7 @@ class VTransform : public StackObj { void optimize() { return _graph.optimize(*this); } bool schedule() { return _graph.schedule(); } + bool is_profitable() const; bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); } void apply(); From 30d916f842f651fac5466116979da608b7309932 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 20 Oct 2025 16:32:16 +0200 Subject: [PATCH 05/39] code transfer wip --- src/hotspot/share/opto/superword.cpp | 19 ++++- src/hotspot/share/opto/vectorization.cpp | 87 
+++++++++++++++++++++++ src/hotspot/share/opto/vectorization.hpp | 11 +++ src/hotspot/share/opto/vtransform.cpp | 89 ++++++++++++++++++++++++ src/hotspot/share/opto/vtransform.hpp | 2 + 5 files changed, 205 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 50e4ebece3072..f8d247bf6dd32 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1899,6 +1899,7 @@ bool SuperWord::do_vtransform() const { return true; } +// TODO: move to other file bool VTransform::is_profitable() const { assert(_graph.is_scheduled(), "must already be scheduled"); @@ -1920,11 +1921,23 @@ bool VTransform::is_profitable() const { return true; } + // Note: currently we only do throughput-based cost-modeling. In the future, we could + // also implement latency-based cost-modeling and take store-to-load-forwarding + // failures into account as the latency between the load and store. This would + // allow a more precise tradeoff between the forwarding failure penalty versus + // the vectorization gains. if (has_store_to_load_forwarding_failure()) { return false; } - // TODO: check cost - - return true; + // Cost-model + float scalar_cost = _vloop_analyzer.cost(); + float vector_cost = cost(); +#ifndef PRODUCT + if (_trace._info) { + tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f", + scalar_cost, vector_cost); + } +#endif + return vector_cost < scalar_cost; } // Apply the vectorization, i.e. we irreversibly edit the C2 graph. At this point, all diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 5c4e15fdbb916..84e81b214c9b7 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -541,6 +541,93 @@ void VLoopDependencyGraph::PredsIterator::next() { } } +//bool VLoopAnalyzer::has_zero_cost(Node* n) const { +// // Outside body? 
+// if (!_vloop.in_bb(n)) { return true; } +// +// // Internal nodes of pointer expressions are most likely folded into +// // the load / store and have no additional cost. +// if (vpointers().is_in_pointer_expression(n)) { return true; } +// +// if (n->is_AddP() || // Pointer expression +// n->is_CFG() || // CFG +// n->is_Phi() || // CFG +// n->is_Cmp() || // CFG +// n->is_Bool()) { // CFG +// return true; +// } +// +// // All other nodes have a non-zero cost. +// return false; +//} + +// Compute the cost over all operations in the (scalar) loop. +float VLoopAnalyzer::cost() const { + return 0; +} + +// TODO: impl +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr("\nVLoopAnalyzer::cost:"); +// } +//#endif +// +// float sum = 0; +// for (int j = 0; j < body().body().length(); j++) { +// Node* n = body().body().at(j); +// if (!has_zero_cost(n)) { +// float c = cost_for_scalar(n->Opcode()); +// sum += c; +//#ifndef PRODUCT +// if (_vloop.is_trace_cost_verbose()) { +// tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name()); +// } +//#endif +// } +// } +// +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr(" total_cost = %.2f", sum); +// } +//#endif +// return sum; +//} +// +//float VLoopAnalyzer::cost_for_scalar(int opcode) const { +// float c = Matcher::cost_for_scalar(opcode); +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); +// } +//#endif +// return c; +//} +// +//float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { +// float c = Matcher::cost_for_vector(opcode, vlen, bt); +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", +// c, NodeClassNames[opcode], vlen, type2name(bt)); +// } +//#endif +// return c; +//} +// +//float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { +// float c = 
Matcher::cost_for_vector_reduction(opcode, vlen, bt, requires_strict_order); +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", +// c, NodeClassNames[opcode], vlen, type2name(bt), +// requires_strict_order ? "true" : "false"); +// } +//#endif +// return c; +//} + // Computing aliasing runtime check using init and last of main-loop // ----------------------------------------------------------------- // diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index b1be52d531a51..aa72980db83bb 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -810,6 +810,17 @@ class VLoopAnalyzer : StackObj { const VLoopVPointers& vpointers() const { return _vpointers; } const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; } + // Compute the cost of the (scalar) body. + float cost() const; + // TODO: impl + // bool has_zero_cost(Node* n) const; + + // TODO: impl + // // Cost-modeling with tracing. + // float cost_for_scalar(int opcode) const; + // float cost_for_vector(int opcode, int vlen, BasicType bt) const; + // float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; + private: bool setup_submodules(); VStatus setup_submodules_helper(); diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 46e8f43cb657d..eb982e3ccbe9f 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -186,6 +186,95 @@ int VTransformGraph::count_alive_vtnodes() const { return count; } +// Find all nodes that in the loop, in a 2-phase process: +// - First, find all nodes that are not before the loop: +// - loop-phis +// - loads and stores that are in the loop +// - and all their transitive uses. 
+// - Second, we find all nodes that are not after the loop: +// - backedges +// - loads and stores that are in the loop +// - and all their transitive uses. +//void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { +// assert(is_scheduled(), "must already be scheduled"); +// +// // Phase 1: find all nodes that are not before the loop. +// VectorSet is_not_before_loop; +// for (int i = 0; i < _schedule.length(); i++) { +// VTransformNode* vtn = _schedule.at(i); +// // Is vtn a loop-phi? +// if (vtn->isa_LoopPhi() != nullptr || +// vtn->is_load_or_store_in_loop()) { +// is_not_before_loop.set(vtn->_idx); +// continue; +// } +// // Or one of its transitive uses? +// for (uint j = 0; j < vtn->req(); j++) { +// VTransformNode* def = vtn->in(j); +// if (def != nullptr && is_not_before_loop.test(def->_idx)) { +// is_not_before_loop.set(vtn->_idx); +// break; +// } +// } +// } +// +// // Phase 2: find all nodes that are not after the loop. +// for (int i = _schedule.length()-1; i >= 0; i--) { +// VTransformNode* vtn = _schedule.at(i); +// if (!is_not_before_loop.test(vtn->_idx)) { continue; } +// // Is load or store? +// if (vtn->is_load_or_store_in_loop()) { +// in_loop.set(vtn->_idx); +// continue; +// } +// for (int i = 0; i < vtn->outs(); i++) { +// VTransformNode* use = vtn->out(i); +// // Or is vtn a backedge or one of its transitive defs? 
+// if (in_loop.test(use->_idx) || +// use->isa_LoopPhi() != nullptr) { +// in_loop.set(vtn->_idx); +// break; +// } +// } +// } +//} + +float VTransformGraph::cost() const { + assert(is_scheduled(), "must already be scheduled"); + return 1; +} +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr("\nVTransformGraph::cost:"); +// } +//#endif +// +// ResourceMark rm; +// VectorSet in_loop; +// mark_vtnodes_in_loop(in_loop); +// +// float sum = 0; +// for (int i = 0; i < _schedule.length(); i++) { +// VTransformNode* vtn = _schedule.at(i); +// if (!in_loop.test(vtn->_idx)) { continue; } +// float c = vtn->cost(_vloop_analyzer); +// sum += c; +//#ifndef PRODUCT +// if (c != 0 && _vloop.is_trace_cost_verbose()) { +// tty->print(" -> cost = %.2f for ", c); +// vtn->print(); +// } +//#endif +// } +// +//#ifndef PRODUCT +// if (_vloop.is_trace_cost()) { +// tty->print_cr(" total_cost = %.2f", sum); +// } +//#endif +// return sum; +//} + #ifndef PRODUCT void VTransformGraph::trace_schedule_cycle(const GrowableArray& stack, const VectorSet& pre_visited, diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index b0902cab5ca9f..60efaa544b206 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -194,6 +194,7 @@ class VTransformGraph : public StackObj { void optimize(VTransform& vtransform); bool schedule(); bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const; + float cost() const; void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const; private: @@ -257,6 +258,7 @@ class VTransform : public StackObj { void optimize() { return _graph.optimize(*this); } bool schedule() { return _graph.schedule(); } bool is_profitable() const; + float cost() const { return _graph.cost(); } bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); } void apply(); 
From da3b4b3134a4e204237660465ae8b6800577eda7 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 20 Oct 2025 16:46:29 +0200 Subject: [PATCH 06/39] wip apply code --- .../share/opto/traceAutoVectorizationTag.hpp | 2 + src/hotspot/share/opto/vectorization.hpp | 8 ++ src/hotspot/share/opto/vtransform.cpp | 116 +++++++++--------- src/hotspot/share/opto/vtransform.hpp | 1 + 4 files changed, 70 insertions(+), 57 deletions(-) diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index d996173aeb43b..aac3d09f44995 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -47,6 +47,8 @@ flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \ flags(VTRANSFORM, "Trace VTransform Graph") \ flags(OPTIMIZATION, "Trace VTransform::optimize") \ + flags(COST, "Trace cost of VLoop (scalar) and VTransform (vector)") \ + flags(COST_VERBOSE, "Trace like COST, but more verbose") \ flags(ALIGN_VECTOR, "Trace AlignVector") \ flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \ flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \ diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index aa72980db83bb..25f5b99b904f5 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -209,6 +209,14 @@ class VLoop : public StackObj { return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION); } + bool is_trace_cost() const { + return _vtrace.is_trace(TraceAutoVectorizationTag::COST); + } + + bool is_trace_cost_verbose() const { + return _vtrace.is_trace(TraceAutoVectorizationTag::COST_VERBOSE); + } + bool is_trace_speculative_runtime_checks() const { return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS); } diff --git a/src/hotspot/share/opto/vtransform.cpp 
b/src/hotspot/share/opto/vtransform.cpp index eb982e3ccbe9f..46db23c5e52e9 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -195,65 +195,67 @@ int VTransformGraph::count_alive_vtnodes() const { // - backedges // - loads and stores that are in the loop // - and all their transitive uses. -//void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { -// assert(is_scheduled(), "must already be scheduled"); // -// // Phase 1: find all nodes that are not before the loop. -// VectorSet is_not_before_loop; -// for (int i = 0; i < _schedule.length(); i++) { -// VTransformNode* vtn = _schedule.at(i); -// // Is vtn a loop-phi? -// if (vtn->isa_LoopPhi() != nullptr || -// vtn->is_load_or_store_in_loop()) { -// is_not_before_loop.set(vtn->_idx); -// continue; -// } -// // Or one of its transitive uses? -// for (uint j = 0; j < vtn->req(); j++) { -// VTransformNode* def = vtn->in(j); -// if (def != nullptr && is_not_before_loop.test(def->_idx)) { -// is_not_before_loop.set(vtn->_idx); -// break; -// } -// } -// } -// -// // Phase 2: find all nodes that are not after the loop. -// for (int i = _schedule.length()-1; i >= 0; i--) { -// VTransformNode* vtn = _schedule.at(i); -// if (!is_not_before_loop.test(vtn->_idx)) { continue; } -// // Is load or store? -// if (vtn->is_load_or_store_in_loop()) { -// in_loop.set(vtn->_idx); -// continue; -// } -// for (int i = 0; i < vtn->outs(); i++) { -// VTransformNode* use = vtn->out(i); -// // Or is vtn a backedge or one of its transitive defs? -// if (in_loop.test(use->_idx) || -// use->isa_LoopPhi() != nullptr) { -// in_loop.set(vtn->_idx); -// break; -// } -// } -// } -//} +// in_loop: vtn->_idx -> bool +void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { + assert(is_scheduled(), "must already be scheduled"); + + // Phase 1: find all nodes that are not before the loop. 
+ VectorSet is_not_before_loop; + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + // Is vtn a loop-phi? + if (vtn->isa_LoopPhi() != nullptr || + // TODO: what about VTransformCountedLoopNode? + vtn->is_load_or_store_in_loop()) { + is_not_before_loop.set(vtn->_idx); + continue; + } + // Or one of its transitive uses? + for (uint j = 0; j < vtn->req(); j++) { + VTransformNode* def = vtn->in_req(j); + if (def != nullptr && is_not_before_loop.test(def->_idx)) { + is_not_before_loop.set(vtn->_idx); + break; + } + } + } + + // Phase 2: find all nodes that are not after the loop. + for (int i = _schedule.length()-1; i >= 0; i--) { + VTransformNode* vtn = _schedule.at(i); + if (!is_not_before_loop.test(vtn->_idx)) { continue; } + // Is load or store? + if (vtn->is_load_or_store_in_loop()) { + in_loop.set(vtn->_idx); + continue; + } + for (uint i = 0; i < vtn->out_strong_edges(); i++) { + VTransformNode* use = vtn->out_strong_edge(i); + // Or is vtn a backedge or one of its transitive defs? + if (in_loop.test(use->_idx) || + use->isa_LoopPhi() != nullptr) { + in_loop.set(vtn->_idx); + break; + } + } + // TODO: what about CFG nodes? 
+ } +} float VTransformGraph::cost() const { assert(is_scheduled(), "must already be scheduled"); - return 1; -} -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr("\nVTransformGraph::cost:"); -// } -//#endif -// -// ResourceMark rm; -// VectorSet in_loop; -// mark_vtnodes_in_loop(in_loop); -// -// float sum = 0; +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr("\nVTransformGraph::cost:"); + } +#endif + + ResourceMark rm; + VectorSet in_loop; // vtn->_idx -> bool + mark_vtnodes_in_loop(in_loop); + + float sum = 0; // for (int i = 0; i < _schedule.length(); i++) { // VTransformNode* vtn = _schedule.at(i); // if (!in_loop.test(vtn->_idx)) { continue; } @@ -272,8 +274,8 @@ float VTransformGraph::cost() const { // tty->print_cr(" total_cost = %.2f", sum); // } //#endif -// return sum; -//} + return sum; +} #ifndef PRODUCT void VTransformGraph::trace_schedule_cycle(const GrowableArray& stack, diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 60efaa544b206..4c6d4fd70f9ad 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -205,6 +205,7 @@ class VTransformGraph : public StackObj { void collect_nodes_without_strong_in_edges(GrowableArray& stack) const; int count_alive_vtnodes() const; + void mark_vtnodes_in_loop(VectorSet& in_loop) const; #ifndef PRODUCT void print_vtnodes() const; From b7b5ac00a7f08769a686c249034a3478653dada3 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 20 Oct 2025 17:25:20 +0200 Subject: [PATCH 07/39] wip impl cost for vtn --- src/hotspot/share/opto/vectorization.cpp | 106 ++++++++++++----------- src/hotspot/share/opto/vectorization.hpp | 14 ++- src/hotspot/share/opto/vtransform.cpp | 103 ++++++++++++++++++---- src/hotspot/share/opto/vtransform.hpp | 19 ++++ 4 files changed, 164 insertions(+), 78 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 
84e81b214c9b7..0f59f746f21d1 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -541,25 +541,27 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -//bool VLoopAnalyzer::has_zero_cost(Node* n) const { -// // Outside body? -// if (!_vloop.in_bb(n)) { return true; } -// -// // Internal nodes of pointer expressions are most likely folded into -// // the load / store and have no additional cost. -// if (vpointers().is_in_pointer_expression(n)) { return true; } -// -// if (n->is_AddP() || // Pointer expression -// n->is_CFG() || // CFG -// n->is_Phi() || // CFG -// n->is_Cmp() || // CFG -// n->is_Bool()) { // CFG -// return true; -// } -// -// // All other nodes have a non-zero cost. -// return false; -//} +bool VLoopAnalyzer::has_zero_cost(Node* n) const { + // Outside body? + if (!_vloop.in_bb(n)) { return true; } + // TODO: can we widen this to the loop, not just bb? + + // Internal nodes of pointer expressions are most likely folded into + // the load / store and have no additional cost. + // TODO: implement + // if (vpointers().is_in_pointer_expression(n)) { return true; } + + if (n->is_AddP() || // Pointer expression + n->is_CFG() || // CFG + n->is_Phi() || // CFG + n->is_Cmp() || // CFG + n->is_Bool()) { // CFG + return true; + } + + // All other nodes have a non-zero cost. + return false; +} // Compute the cost over all operations in the (scalar) loop. 
float VLoopAnalyzer::cost() const { @@ -594,39 +596,39 @@ float VLoopAnalyzer::cost() const { //#endif // return sum; //} -// -//float VLoopAnalyzer::cost_for_scalar(int opcode) const { -// float c = Matcher::cost_for_scalar(opcode); -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); -// } -//#endif -// return c; -//} -// -//float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { -// float c = Matcher::cost_for_vector(opcode, vlen, bt); -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", -// c, NodeClassNames[opcode], vlen, type2name(bt)); -// } -//#endif -// return c; -//} -// -//float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { -// float c = Matcher::cost_for_vector_reduction(opcode, vlen, bt, requires_strict_order); -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", -// c, NodeClassNames[opcode], vlen, type2name(bt), -// requires_strict_order ? 
"true" : "false"); -// } -//#endif -// return c; -//} + +float VLoopAnalyzer::cost_for_scalar(int opcode) const { + float c = Matcher::cost_for_scalar(opcode); +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); + } +#endif + return c; +} + +float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { + float c = Matcher::cost_for_vector(opcode, vlen, bt); +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", + c, NodeClassNames[opcode], vlen, type2name(bt)); + } +#endif + return c; +} + +float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { + float c = Matcher::cost_for_vector_reduction(opcode, vlen, bt, requires_strict_order); +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", + c, NodeClassNames[opcode], vlen, type2name(bt), + requires_strict_order ? "true" : "false"); + } +#endif + return c; +} // Computing aliasing runtime check using init and last of main-loop // ----------------------------------------------------------------- diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 25f5b99b904f5..89f5778a45335 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -820,14 +820,12 @@ class VLoopAnalyzer : StackObj { // Compute the cost of the (scalar) body. float cost() const; - // TODO: impl - // bool has_zero_cost(Node* n) const; - - // TODO: impl - // // Cost-modeling with tracing. - // float cost_for_scalar(int opcode) const; - // float cost_for_vector(int opcode, int vlen, BasicType bt) const; - // float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; + bool has_zero_cost(Node* n) const; + + // Cost-modeling with tracing. 
+ float cost_for_scalar(int opcode) const; + float cost_for_vector(int opcode, int vlen, BasicType bt) const; + float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; private: bool setup_submodules(); diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 46db23c5e52e9..542bc194511b5 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -256,24 +256,24 @@ float VTransformGraph::cost() const { mark_vtnodes_in_loop(in_loop); float sum = 0; -// for (int i = 0; i < _schedule.length(); i++) { -// VTransformNode* vtn = _schedule.at(i); -// if (!in_loop.test(vtn->_idx)) { continue; } -// float c = vtn->cost(_vloop_analyzer); -// sum += c; -//#ifndef PRODUCT -// if (c != 0 && _vloop.is_trace_cost_verbose()) { -// tty->print(" -> cost = %.2f for ", c); -// vtn->print(); -// } -//#endif -// } -// -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr(" total_cost = %.2f", sum); -// } -//#endif + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + if (!in_loop.test(vtn->_idx)) { continue; } + float c = vtn->cost(_vloop_analyzer); + sum += c; +#ifndef PRODUCT + if (c != 0 && _vloop.is_trace_cost_verbose()) { + tty->print(" -> cost = %.2f for ", c); + vtn->print(); + } +#endif + } + +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" total_cost = %.2f", sum); + } +#endif return sum; } @@ -922,6 +922,10 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app } } +float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_scalar(_node->Opcode()); +} + VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const { apply_vtn_inputs_to_node(_node, apply_state); // The memory state has to be applied separately: the vtn does not hold it. This allows reordering. 
@@ -934,6 +938,10 @@ VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& app return VTransformApplyResult::make_scalar(_node); } +float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_scalar(_node->Opcode()); +} + VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const { apply_vtn_inputs_to_node(_node, apply_state); return VTransformApplyResult::make_scalar(_node); @@ -986,6 +994,10 @@ VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_sta return VTransformApplyResult::make_scalar(_node); } +float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector(Op_Replicate, _vlen, _element_type); +} + VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const { Node* val = apply_state.transformed_node(in_req(1)); VectorNode* vn = VectorNode::scalar2vector(val, _vlen, _element_type); @@ -993,6 +1005,10 @@ VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply return VTransformApplyResult::make_vector(vn); } +float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_scalar(Op_ConvI2L); +} + VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const { Node* val = apply_state.transformed_node(in_req(1)); Node* n = new ConvI2LNode(val); @@ -1000,6 +1016,12 @@ VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_s return VTransformApplyResult::make_scalar(n); } +float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode); + return vloop_analyzer.cost_for_scalar(Op_AndI) + + vloop_analyzer.cost_for_vector(shift_count_opc, _vlen, _element_bt); +} + VTransformApplyResult 
VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const { PhaseIdealLoop* phase = apply_state.phase(); Node* shift_count_in = apply_state.transformed_node(in_req(1)); @@ -1015,6 +1037,9 @@ VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& appl return VTransformApplyResult::make_vector(vn); } +float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector(Op_PopulateIndex, _vlen, _element_bt); +} VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const { PhaseIdealLoop* phase = apply_state.phase(); @@ -1027,6 +1052,10 @@ VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& a return VTransformApplyResult::make_vector(vn); } +float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()); +} + VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const { assert(2 <= req() && req() <= 4, "Must have 1-3 inputs"); const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length()); @@ -1045,6 +1074,13 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat return VTransformApplyResult::make_vector(vn); } +float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + // // TODO: implement, consider cast etc. 
+ // return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()) + + // vloop_analyzer.cost_for_vector(Op_VectorCastL2X, vector_length(), XXX); + return 2; +} + VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const { uint vlen = vector_length(); int sopc = scalar_opcode(); @@ -1060,6 +1096,12 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply( return VTransformApplyResult::make_vector(vn); } +float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + // TODO: implement + //return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()); + return 1; +} + VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const { const TypeVect* dst_vt = TypeVect::make(element_basic_type(), vector_length()); const TypeVect* src_vt = TypeVect::make(_src_bt, vector_length()); @@ -1072,6 +1114,11 @@ VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyStat return VTransformApplyResult::make_vector(vn); } +float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + assert(scalar_opcode() == Op_Bool, ""); + return vloop_analyzer.cost_for_vector(Op_VectorMaskCmp, vector_length(), element_basic_type()); +} + VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const { const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length()); assert(scalar_opcode() == Op_Bool, ""); @@ -1327,6 +1374,14 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou return true; // success } +float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + int vopc = vector_reduction_opcode(); + bool requires_strict_order = 
ReductionNode::auto_vectorization_requires_strict_order(vopc); + return vloop_analyzer.cost_for_vector_reduction(vopc, vlen, bt, requires_strict_order); +} + VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const { Node* init = apply_state.transformed_node(in_req(1)); Node* vec = apply_state.transformed_node(in_req(2)); @@ -1336,6 +1391,12 @@ VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& return VTransformApplyResult::make_vector(vn, vn->vect_type()); } +float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + return vloop_analyzer.cost_for_vector(Op_LoadVector, vlen, bt); +} + VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const { int sopc = scalar_opcode(); uint vlen = vector_length(); @@ -1365,6 +1426,12 @@ VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& appl return VTransformApplyResult::make_vector(vn, vn->vect_type()); } +float VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + return vloop_analyzer.cost_for_vector(Op_StoreVector, vlen, bt); +} + VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const { int sopc = scalar_opcode(); uint vlen = vector_length(); diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 4c6d4fd70f9ad..a887300806ce9 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -557,6 +557,8 @@ class VTransformNode : public ArenaObj { virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const = 0; + virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 
0; virtual void apply_backedge(VTransformApplyState& apply_state) const {}; void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const; @@ -587,6 +589,7 @@ class VTransformMemopScalarNode : public VTransformNode { virtual bool is_load_or_store_in_loop() const override { return true; } virtual const VPointer& vpointer() const override { return _vpointer; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "MemopScalar"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -603,6 +606,7 @@ class VTransformDataScalarNode : public VTransformNode { assert(!_node->is_Mem() && !_node->is_Phi() && !_node->is_CFG(), "must be data node: %s", _node->Name()); } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "DataScalar"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -620,6 +624,7 @@ class VTransformLoopPhiNode : public VTransformNode { } virtual VTransformLoopPhiNode* isa_LoopPhi() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; virtual void apply_backedge(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoopPhi"; };) @@ -637,6 +642,7 @@ class VTransformCFGNode : public VTransformNode { assert(_node->is_CFG(), "must be CFG node: %s", _node->Name()); } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const 
override { return "CFG"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -663,6 +669,7 @@ class VTransformOuterNode : public VTransformNode { VTransformNode(vtransform, n->req()), _node(n) {} virtual VTransformOuterNode* isa_Outer() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { ShouldNotReachHere(); } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "Outer"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -676,6 +683,7 @@ class VTransformReplicateNode : public VTransformNode { public: VTransformReplicateNode(VTransform& vtransform, int vlen, BasicType element_type) : VTransformNode(vtransform, 2), _vlen(vlen), _element_type(element_type) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "Replicate"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -685,6 +693,7 @@ class VTransformReplicateNode : public VTransformNode { class VTransformConvI2LNode : public VTransformNode { public: VTransformConvI2LNode(VTransform& vtransform) : VTransformNode(vtransform, 2) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ConvI2L"; };) }; @@ -699,6 +708,7 @@ class VTransformShiftCountNode : public VTransformNode { public: VTransformShiftCountNode(VTransform& vtransform, int vlen, BasicType element_bt, juint mask, int shift_opcode) : VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt), _mask(mask), _shift_opcode(shift_opcode) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual 
VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ShiftCount"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -712,6 +722,7 @@ class VTransformPopulateIndexNode : public VTransformNode { public: VTransformPopulateIndexNode(VTransform& vtransform, int vlen, const BasicType element_bt) : VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "PopulateIndex"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -777,6 +788,7 @@ class VTransformElementWiseVectorNode : public VTransformVectorNode { VTransformElementWiseVectorNode(VTransform& vtransform, uint req, const VTransformVectorNodeProperties properties, const int vector_opcode) : VTransformVectorNode(vtransform, req, properties), _vector_opcode(vector_opcode) {} virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -789,6 +801,7 @@ class VTransformElementWiseLongOpWithCastToIntVectorNode : public VTransformVect public: VTransformElementWiseLongOpWithCastToIntVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) : VTransformVectorNode(vtransform, 2, properties) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return 
"ElementWiseLongOpWithCastToIntVector"; };) }; @@ -799,6 +812,7 @@ class VTransformReinterpretVectorNode : public VTransformVectorNode { public: VTransformReinterpretVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties, const BasicType src_bt) : VTransformVectorNode(vtransform, 2, properties), _src_bt(src_bt) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ReinterpretVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -819,6 +833,7 @@ class VTransformCmpVectorNode : public VTransformVectorNode { VTransformCmpVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) : VTransformVectorNode(vtransform, 3, properties) {} virtual VTransformCmpVectorNode* isa_CmpVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override { return VTransformApplyResult::make_empty(); } NOT_PRODUCT(virtual const char* name() const override { return "CmpVector"; };) }; @@ -831,6 +846,7 @@ class VTransformBoolVectorNode : public VTransformVectorNode { VTransformVectorNode(vtransform, 2, properties), _test(test) {} VTransformBoolTest test() const { return _test; } virtual VTransformBoolVectorNode* isa_BoolVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "BoolVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -843,6 +859,7 @@ class VTransformReductionVectorNode : public VTransformVectorNode { VTransformVectorNode(vtransform, 3, properties) {} virtual 
VTransformReductionVectorNode* isa_ReductionVector() override { return this; } virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override; + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };) @@ -885,6 +902,7 @@ class VTransformLoadVectorNode : public VTransformMemVectorNode { LoadNode::ControlDependency control_dependency() const; virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; } virtual bool is_load_in_loop() const override { return true; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };) }; @@ -896,6 +914,7 @@ class VTransformStoreVectorNode : public VTransformMemVectorNode { VTransformMemVectorNode(vtransform, 4, properties, vpointer, adr_type) {} virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; } virtual bool is_load_in_loop() const override { return false; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };) }; From 49f9242c09c57ee133d8d76762a2f653cb9ad58b Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 22 Oct 2025 08:56:35 +0200 Subject: [PATCH 08/39] impl more cost methods --- src/hotspot/share/opto/vtransform.cpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 542bc194511b5..cec4cff7fb49b 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp 
@@ -1075,10 +1075,9 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat } float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - // // TODO: implement, consider cast etc. - // return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()) + - // vloop_analyzer.cost_for_vector(Op_VectorCastL2X, vector_length(), XXX); - return 2; + int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type()); + return vloop_analyzer.cost_for_vector(vopc, vector_length(), element_basic_type()) + + vloop_analyzer.cost_for_vector(Op_VectorCastL2X, vector_length(), T_INT); } VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1097,9 +1096,7 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply( } float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - // TODO: implement - //return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()); - return 1; + return vloop_analyzer.cost_for_vector(Op_VectorReinterpret, vector_length(), element_basic_type()); } VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const { From b32afed895aa72d39c07fe149bb8a7e51cb13c76 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 22 Oct 2025 09:03:15 +0200 Subject: [PATCH 09/39] fix comment --- src/hotspot/share/opto/superword.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index f8d247bf6dd32..e359fa87e10c0 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1899,7 +1899,8 @@ bool SuperWord::do_vtransform() const { return true; } -// TODO: move to other file +// Check Cost-Model, and other heuristics. 
+// Can be overridden with AutoVectorizationOverrideProfitability. bool VTransform::is_profitable() const { assert(_graph.is_scheduled(), "must already be scheduled"); From a77059f8e0f3661da7423e225ba834cab56d53e4 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 22 Oct 2025 09:07:18 +0200 Subject: [PATCH 10/39] scalar cost --- src/hotspot/share/opto/vectorization.cpp | 57 +++++++++++------------- 1 file changed, 27 insertions(+), 30 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 0f59f746f21d1..31a28cca305c9 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -565,37 +565,34 @@ bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Compute the cost over all operations in the (scalar) loop. float VLoopAnalyzer::cost() const { - return 0; -} +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr("\nVLoopAnalyzer::cost:"); + } +#endif -// TODO: impl -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr("\nVLoopAnalyzer::cost:"); -// } -//#endif -// -// float sum = 0; -// for (int j = 0; j < body().body().length(); j++) { -// Node* n = body().body().at(j); -// if (!has_zero_cost(n)) { -// float c = cost_for_scalar(n->Opcode()); -// sum += c; -//#ifndef PRODUCT -// if (_vloop.is_trace_cost_verbose()) { -// tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name()); -// } -//#endif -// } -// } -// -//#ifndef PRODUCT -// if (_vloop.is_trace_cost()) { -// tty->print_cr(" total_cost = %.2f", sum); -// } -//#endif -// return sum; -//} + float sum = 0; + // TODO: does this go over the whole loop, or just the basic block? 
+ for (int j = 0; j < body().body().length(); j++) { + Node* n = body().body().at(j); + if (!has_zero_cost(n)) { + float c = cost_for_scalar(n->Opcode()); + sum += c; +#ifndef PRODUCT + if (_vloop.is_trace_cost_verbose()) { + tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name()); + } +#endif + } + } + +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" total_cost = %.2f", sum); + } +#endif + return sum; +} float VLoopAnalyzer::cost_for_scalar(int opcode) const { float c = Matcher::cost_for_scalar(opcode); From a8f11c496f02d778aa0455d8cba8afcbaf5c29ee Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 08:13:00 +0200 Subject: [PATCH 11/39] ignore pointer expression nodes --- .../share/opto/traceAutoVectorizationTag.hpp | 2 +- src/hotspot/share/opto/vectorization.cpp | 5 ++- src/hotspot/share/opto/vectorization.hpp | 36 ++++++++++++++++++- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index aac3d09f44995..4f67aff9b0706 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -38,7 +38,7 @@ flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ flags(BODY, "Trace VLoopBody") \ flags(TYPES, "Trace VLoopTypes") \ - flags(POINTERS, "Trace VLoopPointers") \ + flags(POINTERS, "Trace VLoopVPointers") \ flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \ flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \ flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \ diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 31a28cca305c9..b2cc4200015bb 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -287,7 +287,7 @@ void VLoopVPointers::compute_and_cache_vpointers() { int pointers_idx = 0; 
_body.for_each_mem([&] (MemNode* const mem, int bb_idx) { // Placement new: construct directly into the array. - ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop); + ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, _pointer_expression_nodes); _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); pointers_idx++; }); @@ -548,8 +548,7 @@ bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Internal nodes of pointer expressions are most likely folded into // the load / store and have no additional cost. - // TODO: implement - // if (vpointers().is_in_pointer_expression(n)) { return true; } + if (vpointers().is_in_pointer_expression(n)) { return true; } if (n->is_AddP() || // Pointer expression n->is_CFG() || // CFG diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 89f5778a45335..419c29d6544a5 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -592,6 +592,32 @@ class VLoopTypes : public StackObj { const Type* container_type(Node* n) const; }; +// Mark all nodes from the loop that are part of any VPointer expression. +class PointerExpressionNodes : public MemPointerParserCallback { +private: + const VLoop& _vloop; + const VLoopBody& _body; + VectorSet _in_pointer_expression; + +public: + PointerExpressionNodes(Arena* arena, + const VLoop& vloop, + const VLoopBody& body) : + _vloop(vloop), + _body(body), + _in_pointer_expression(arena) {} + + virtual void callback(Node* n) override { + if (!_vloop.in_bb(n)) { return; } + _in_pointer_expression.set(_body.bb_idx(n)); + } + + bool contains(const Node* n) const { + if (!_vloop.in_bb(n)) { return false; } + return _in_pointer_expression.test(_body.bb_idx(n)); + } +}; + // Submodule of VLoopAnalyzer. // We compute and cache the VPointer for every load and store. class VLoopVPointers : public StackObj { @@ -607,6 +633,9 @@ class VLoopVPointers : public StackObj { // Map bb_idx -> index in _vpointers. 
-1 if not mapped. GrowableArray _bb_idx_to_vpointer; + // Mark all nodes that are part of any pointer expression. + PointerExpressionNodes _pointer_expression_nodes; + public: VLoopVPointers(Arena* arena, const VLoop& vloop, @@ -618,13 +647,18 @@ _bb_idx_to_vpointer(arena, vloop.estimated_body_length(), vloop.estimated_body_length(), - -1) {} + -1), + _pointer_expression_nodes(arena, _vloop, _body) {} NONCOPYABLE(VLoopVPointers); void compute_vpointers(); const VPointer& vpointer(const MemNode* mem) const; NOT_PRODUCT( void print() const; ) + bool is_in_pointer_expression(const Node* n) const { + return _pointer_expression_nodes.contains(n); + } + private: void count_vpointers(); void allocate_vpointers_array(); From 693dcf1aca9bad6df66bf5a1fae61b0d5896f78e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 09:37:45 +0200 Subject: [PATCH 12/39] zero cost for data scalar nodes that have zero cost --- src/hotspot/share/opto/vectorization.cpp | 1 + src/hotspot/share/opto/vtransform.cpp | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index b2cc4200015bb..79320948d2110 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -541,6 +541,7 @@ void VLoopDependencyGraph::PredsIterator::next() { } } +// TODO: Description bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Outside body?
if (!_vloop.in_bb(n)) { return true; } diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index cec4cff7fb49b..0fca37ca39ba3 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -923,6 +923,8 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app } float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + // This is an identity transform, but loads and stores must be counted. + assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted"); return vloop_analyzer.cost_for_scalar(_node->Opcode()); } @@ -939,7 +941,13 @@ VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& app } float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return vloop_analyzer.cost_for_scalar(_node->Opcode()); + // Since this is an identity transform, we may have nodes that also + // VLoopAnalyzer::cost does not count for the scalar loop. 
+ if (vloop_analyzer.has_zero_cost(_node)) { + return 0; + } else { + return vloop_analyzer.cost_for_scalar(_node->Opcode()); + } } VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const { From 2a9aba2bb5d4aeb8a108d4f6c84da84d698fa029 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 10:00:48 +0200 Subject: [PATCH 13/39] improve documentation and fix test --- src/hotspot/share/opto/vectorization.cpp | 8 +++++++- ...tAutoVectorizationOverrideProfitability.java | 17 +++++++++-------- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 79320948d2110..3527e502afb7a 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -541,7 +541,8 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -// TODO: Description +// Cost-model heuristic for nodes that do not contribute to computational +// cost inside the loop. bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Outside body? if (!_vloop.in_bb(n)) { return true; } @@ -551,6 +552,11 @@ bool VLoopAnalyzer::has_zero_cost(Node* n) const { // the load / store and have no additional cost. if (vpointers().is_in_pointer_expression(n)) { return true; } + // Not all AddP nodes can be detected in VPointer parsing, so + we filter them out here. + We don't want to explicitly model the cost of control flow, + since we have the same CFG structure before and after + vectorization: A loop head, a loop exit, with a backedge.
if (n->is_AddP() || // Pointer expression n->is_CFG() || // CFG n->is_Phi() || // CFG diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java index 10ad19d03a74d..89b46871cb56a 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java @@ -115,17 +115,18 @@ public static void checkSimpleFloatCopy() { @Test @Warmup(10) @IR(applyIfCPUFeatureOr = {"avx", "true"}, - applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}, counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"}) @IR(applyIfCPUFeatureOr = {"avx", "true"}, - applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}, counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"}) - // Current heuristics say that this simple int reduction is not profitable. - // But it would actually be profitable, since we are able to move the - // reduction out of the loop (we can reorder the reduction). When moving - // the reduction out of the loop, we instead accumulate with a simple - // ADD_VI inside the loop. - // See: JDK-8307516 JDK-8345044 + // We are able to vectorize the reduction. But on its own, that would + // not reduce the cost sufficiently in all cases, because vectorized + // reduction nodes are expensive. But since integer addition is associative + // we can move the reduction vector out of the loop. Instead, we accumulate + // with a simple ADD_VI inside the loop, which is very cheap. After the + // loop, we only need to use the vectorized reduction once, to collapse + // the partial sums contained in the lanes. 
private static int simpleIntReduction() { int sum = 0; for (int i = 0; i < aI.length; i++) { From baa41e4b8f85173795db7b47ae062a8e1731eccd Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 10:51:48 +0200 Subject: [PATCH 14/39] fix another test --- .../jtreg/compiler/c2/cr7200264/TestIntVect.java | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java b/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java index 457e33667b2d1..76c33ec1b0772 100644 --- a/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java +++ b/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java @@ -410,12 +410,12 @@ public void run() { } - // Not vectorized: simple addition not profitable, see JDK-8307516. NOTE: - // This check does not document the _desired_ behavior of the system but - // the current behavior (no vectorization) @Test - @IR(counts = { IRNode.LOAD_VECTOR_I, "= 0", - IRNode.STORE_VECTOR, "= 0" }) + @IR(counts = { IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0" }) + // The reduction is moved outside the loop, and we use a + // element-wise accumulator inside the loop. int test_sum(int[] a1) { int sum = 0; for (int i = 0; i < a1.length; i+=1) { From 24a6c338724c074171595f283a12eccad4d76eba Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 10:59:52 +0200 Subject: [PATCH 15/39] resolve some todos --- src/hotspot/share/opto/vtransform.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 0fca37ca39ba3..954a915cbaea4 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -206,7 +206,6 @@ void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { VTransformNode* vtn = _schedule.at(i); // Is vtn a loop-phi? 
if (vtn->isa_LoopPhi() != nullptr || - // TODO: what about VTransformCountedLoopNode? vtn->is_load_or_store_in_loop()) { is_not_before_loop.set(vtn->_idx); continue; @@ -239,7 +238,6 @@ void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { break; } } - // TODO: what about CFG nodes? } } From 5373397ede7410fd7995458a09ad535b973209e7 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 23 Oct 2025 11:05:32 +0200 Subject: [PATCH 16/39] resolve more TODOS --- src/hotspot/share/opto/vectorization.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 3527e502afb7a..15e4248cf409a 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -546,7 +546,6 @@ void VLoopDependencyGraph::PredsIterator::next() { bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Outside body? if (!_vloop.in_bb(n)) { return true; } - // TODO: can we widen this to the loop, not just bb? // Internal nodes of pointer expressions are most likely folded into // the load / store and have no additional cost. @@ -578,7 +577,6 @@ float VLoopAnalyzer::cost() const { #endif float sum = 0; - // TODO: does this go over the whole loop, or just the basic block? 
for (int j = 0; j < body().body().length(); j++) { Node* n = body().body().at(j); if (!has_zero_cost(n)) { From f0d9fa285e3f4a28efc7f6f0f25d6d479cc19b7e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 16:11:51 +0200 Subject: [PATCH 17/39] wip reductions IR test --- .../loopopts/superword/TestReductions.java | 176 ++++++++++++++++++ 1 file changed, 176 insertions(+) create mode 100644 test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java new file mode 100644 index 0000000000000..fcabe4963f029 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -0,0 +1,176 @@ +/* + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 8340093 + * @summary Test vectorization of reduction loops. 
+ * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions xxxx + */ + +package compiler.loopopts.superword; + +import jdk.test.lib.Utils; +import java.util.Map; +import java.util.HashMap; +import java.util.Random; + +import compiler.lib.ir_framework.*; +import compiler.lib.verify.*; +import static compiler.lib.generators.Generators.G; +import compiler.lib.generators.Generator; + +/** + * Note: there is a corresponding JMH benchmark: + * test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java + */ +public class TestReductions { + static int SIZE = 1024*8; + private static final Random RANDOM = Utils.getRandomInstance(); + public static final Generator GEN_I = G.ints(); + public static final Generator GEN_F = G.floats(); + + private static byte[] in1B = fillRandom(new byte[SIZE]); + private static byte[] in2B = fillRandom(new byte[SIZE]); + private static byte[] in3B = fillRandom(new byte[SIZE]); + //private static char[] in1C = fillRandom(new char[SIZE]); + //private static char[] in2C = fillRandom(new char[SIZE]); + //private static char[] in3C = fillRandom(new char[SIZE]); + //private static short[] in1S = fillRandom(new short[SIZE]); + //private static short[] in2S = fillRandom(new short[SIZE]); + //private static short[] in3S = fillRandom(new short[SIZE]); + + private static int[] in1I = fillRandom(new int[SIZE]); + private static int[] in2I = fillRandom(new int[SIZE]); + private static int[] in3I = fillRandom(new int[SIZE]); + //private static long[] in1L = fillRandom(new long[SIZE]); + //private static long[] in2L = fillRandom(new long[SIZE]); + //private static long[] in3L = fillRandom(new long[SIZE]); + + //private static float[] in1F = fillRandom(new float[SIZE]); + //private static float[] in2F = fillRandom(new float[SIZE]); + //private static float[] in3F = fillRandom(new float[SIZE]); + //private static double[] in1D = fillRandom(new doulbe[SIZE]); + //private static double[] in2D = fillRandom(new doulbe[SIZE]); + 
//private static double[] in3D = fillRandom(new doulbe[SIZE]); + + interface TestFunction { + Object run(); + } + + // Map of test names to tests. + Map tests = new HashMap(); + + // Map of gold, the results from the first run (before compilation), one per tests entry. + Map golds = new HashMap(); + + public static void main(String[] args) { + TestFramework framework = new TestFramework(TestReductions.class); + switch (args[0]) { + case "xxxx" -> { framework.addFlags("-XX:-AlignVector"); } + default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); } + }; + framework.start(); + } + + public TestReductions() { + // Add all tests to list + tests.put("test1", TestReductions::test1); + tests.put("test2", TestReductions::test2); + + // Compute gold value for all test methods before compilation + for (Map.Entry entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + Object gold = test.run(); + golds.put(name, gold); + } + } + + @Warmup(100) + @Run(test = {"test1", + "test2"}) + public void runTests() { + for (Map.Entry entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + // Recall gold value from before compilation + Object gold = golds.get(name); + // Compute new result + Object result = test.run(); + // Compare gold and new result + try { + Verify.checkEQ(gold, result); + } catch (VerifyException e) { + throw new RuntimeException("Verify failed for " + name, e); + } + } + } + + static byte[] fillRandom(byte[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (byte)(int)GEN_I.next(); + } + return a; + } + + static int[] fillRandom(int[] a) { + G.fill(GEN_I, a); + return a; + } + + @Test + // @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + // IRNode.STORE_VECTOR, "> 0", + // ".*multiversion.*", "= 0"}, + // phase = CompilePhase.PRINT_IDEAL, + // applyIfPlatform = {"64-bit", "true"}, + // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // 
// Should always vectorize, no speculative runtime check required. + static byte test1() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc += val; + } + return acc; + } + + @Test + // @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + // IRNode.STORE_VECTOR, "> 0", + // ".*multiversion.*", "= 0"}, + // phase = CompilePhase.PRINT_IDEAL, + // applyIfPlatform = {"64-bit", "true"}, + // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // // Should always vectorize, no speculative runtime check required. + static byte test2() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc *= val; + } + return acc; + } +} From 35eec33c11c8cd1d86a81551d42829f9ea21d42f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 16:12:09 +0200 Subject: [PATCH 18/39] linking comment --- .../micro/org/openjdk/bench/vm/compiler/VectorReduction2.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java index ec614cb324bc2..63fbf03008301 100644 --- a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java +++ b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java @@ -28,6 +28,10 @@ import java.util.concurrent.TimeUnit; import java.util.Random; +/** + * Note: there is a corresponding IR test: + * test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java + */ @BenchmarkMode(Mode.AverageTime) @OutputTimeUnit(TimeUnit.NANOSECONDS) @State(Scope.Thread) From 8e4a2ce0f658b91c66401b5a17fa4fa70c18a41b Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 16:17:45 +0200 Subject: [PATCH 19/39] wip test --- .../loopopts/superword/TestReductions.java | 64 +++++++++++++------ 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java 
b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index fcabe4963f029..4072cc1fc3505 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -31,10 +31,8 @@ package compiler.loopopts.superword; -import jdk.test.lib.Utils; import java.util.Map; import java.util.HashMap; -import java.util.Random; import compiler.lib.ir_framework.*; import compiler.lib.verify.*; @@ -47,33 +45,34 @@ */ public class TestReductions { static int SIZE = 1024*8; - private static final Random RANDOM = Utils.getRandomInstance(); public static final Generator GEN_I = G.ints(); + public static final Generator GEN_L = G.longs(); public static final Generator GEN_F = G.floats(); + public static final Generator GEN_D = G.doubles(); private static byte[] in1B = fillRandom(new byte[SIZE]); private static byte[] in2B = fillRandom(new byte[SIZE]); private static byte[] in3B = fillRandom(new byte[SIZE]); - //private static char[] in1C = fillRandom(new char[SIZE]); - //private static char[] in2C = fillRandom(new char[SIZE]); - //private static char[] in3C = fillRandom(new char[SIZE]); - //private static short[] in1S = fillRandom(new short[SIZE]); - //private static short[] in2S = fillRandom(new short[SIZE]); - //private static short[] in3S = fillRandom(new short[SIZE]); + private static char[] in1C = fillRandom(new char[SIZE]); + private static char[] in2C = fillRandom(new char[SIZE]); + private static char[] in3C = fillRandom(new char[SIZE]); + private static short[] in1S = fillRandom(new short[SIZE]); + private static short[] in2S = fillRandom(new short[SIZE]); + private static short[] in3S = fillRandom(new short[SIZE]); private static int[] in1I = fillRandom(new int[SIZE]); private static int[] in2I = fillRandom(new int[SIZE]); private static int[] in3I = fillRandom(new int[SIZE]); - //private static long[] in1L = fillRandom(new long[SIZE]); - //private static long[] in2L = 
fillRandom(new long[SIZE]); - //private static long[] in3L = fillRandom(new long[SIZE]); + private static long[] in1L = fillRandom(new long[SIZE]); + private static long[] in2L = fillRandom(new long[SIZE]); + private static long[] in3L = fillRandom(new long[SIZE]); - //private static float[] in1F = fillRandom(new float[SIZE]); - //private static float[] in2F = fillRandom(new float[SIZE]); - //private static float[] in3F = fillRandom(new float[SIZE]); - //private static double[] in1D = fillRandom(new doulbe[SIZE]); - //private static double[] in2D = fillRandom(new doulbe[SIZE]); - //private static double[] in3D = fillRandom(new doulbe[SIZE]); + private static float[] in1F = fillRandom(new float[SIZE]); + private static float[] in2F = fillRandom(new float[SIZE]); + private static float[] in3F = fillRandom(new float[SIZE]); + private static double[] in1D = fillRandom(new double[SIZE]); + private static double[] in2D = fillRandom(new double[SIZE]); + private static double[] in3D = fillRandom(new double[SIZE]); interface TestFunction { Object run(); @@ -135,11 +134,40 @@ static byte[] fillRandom(byte[] a) { return a; } + static char[] fillRandom(char[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (char)(int)GEN_I.next(); + } + return a; + } + + static short[] fillRandom(short[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (short)(int)GEN_I.next(); + } + return a; + } + static int[] fillRandom(int[] a) { G.fill(GEN_I, a); return a; } + static long[] fillRandom(long[] a) { + G.fill(GEN_L, a); + return a; + } + + static float[] fillRandom(float[] a) { + G.fill(GEN_F, a); + return a; + } + + static double[] fillRandom(double[] a) { + G.fill(GEN_D, a); + return a; + } + @Test // @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", // IRNode.STORE_VECTOR, "> 0", From ed16cf6cc9f8a5b72296c67dd1481bb6f31d449f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 16:38:23 +0200 Subject: [PATCH 20/39] added tests --- .../loopopts/superword/TestReductions.java 
| 1597 ++++++++++++++++- 1 file changed, 1571 insertions(+), 26 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 4072cc1fc3505..6266c08386f28 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -44,11 +44,11 @@ * test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java */ public class TestReductions { - static int SIZE = 1024*8; - public static final Generator GEN_I = G.ints(); - public static final Generator GEN_L = G.longs(); - public static final Generator GEN_F = G.floats(); - public static final Generator GEN_D = G.doubles(); + private static int SIZE = 1024*8; + private static final Generator GEN_I = G.ints(); + private static final Generator GEN_L = G.longs(); + private static final Generator GEN_F = G.floats(); + private static final Generator GEN_D = G.doubles(); private static byte[] in1B = fillRandom(new byte[SIZE]); private static byte[] in2B = fillRandom(new byte[SIZE]); @@ -95,8 +95,141 @@ public static void main(String[] args) { public TestReductions() { // Add all tests to list - tests.put("test1", TestReductions::test1); - tests.put("test2", TestReductions::test2); + tests.put("byteAndSimple", TestReductions::byteAndSimple); + tests.put("byteOrSimple", TestReductions::byteOrSimple); + tests.put("byteXorSimple", TestReductions::byteXorSimple); + tests.put("byteAddSimple", TestReductions::byteAddSimple); + tests.put("byteMulSimple", TestReductions::byteMulSimple); + tests.put("byteMinSimple", TestReductions::byteMinSimple); + tests.put("byteMaxSimple", TestReductions::byteMaxSimple); + tests.put("byteAndDotProduct", TestReductions::byteAndDotProduct); + tests.put("byteOrDotProduct", TestReductions::byteOrDotProduct); + tests.put("byteXorDotProduct", TestReductions::byteXorDotProduct); + tests.put("byteAddDotProduct", 
TestReductions::byteAddDotProduct); + tests.put("byteMulDotProduct", TestReductions::byteMulDotProduct); + tests.put("byteMinDotProduct", TestReductions::byteMinDotProduct); + tests.put("byteMaxDotProduct", TestReductions::byteMaxDotProduct); + tests.put("byteAndBig", TestReductions::byteAndBig); + tests.put("byteOrBig", TestReductions::byteOrBig); + tests.put("byteXorBig", TestReductions::byteXorBig); + tests.put("byteAddBig", TestReductions::byteAddBig); + tests.put("byteMulBig", TestReductions::byteMulBig); + tests.put("byteMinBig", TestReductions::byteMinBig); + tests.put("byteMaxBig", TestReductions::byteMaxBig); + + tests.put("charAndSimple", TestReductions::charAndSimple); + tests.put("charOrSimple", TestReductions::charOrSimple); + tests.put("charXorSimple", TestReductions::charXorSimple); + tests.put("charAddSimple", TestReductions::charAddSimple); + tests.put("charMulSimple", TestReductions::charMulSimple); + tests.put("charMinSimple", TestReductions::charMinSimple); + tests.put("charMaxSimple", TestReductions::charMaxSimple); + tests.put("charAndDotProduct", TestReductions::charAndDotProduct); + tests.put("charOrDotProduct", TestReductions::charOrDotProduct); + tests.put("charXorDotProduct", TestReductions::charXorDotProduct); + tests.put("charAddDotProduct", TestReductions::charAddDotProduct); + tests.put("charMulDotProduct", TestReductions::charMulDotProduct); + tests.put("charMinDotProduct", TestReductions::charMinDotProduct); + tests.put("charMaxDotProduct", TestReductions::charMaxDotProduct); + tests.put("charAndBig", TestReductions::charAndBig); + tests.put("charOrBig", TestReductions::charOrBig); + tests.put("charXorBig", TestReductions::charXorBig); + tests.put("charAddBig", TestReductions::charAddBig); + tests.put("charMulBig", TestReductions::charMulBig); + tests.put("charMinBig", TestReductions::charMinBig); + tests.put("charMaxBig", TestReductions::charMaxBig); + + tests.put("shortAndSimple", TestReductions::shortAndSimple); + 
tests.put("shortOrSimple", TestReductions::shortOrSimple); + tests.put("shortXorSimple", TestReductions::shortXorSimple); + tests.put("shortAddSimple", TestReductions::shortAddSimple); + tests.put("shortMulSimple", TestReductions::shortMulSimple); + tests.put("shortMinSimple", TestReductions::shortMinSimple); + tests.put("shortMaxSimple", TestReductions::shortMaxSimple); + tests.put("shortAndDotProduct", TestReductions::shortAndDotProduct); + tests.put("shortOrDotProduct", TestReductions::shortOrDotProduct); + tests.put("shortXorDotProduct", TestReductions::shortXorDotProduct); + tests.put("shortAddDotProduct", TestReductions::shortAddDotProduct); + tests.put("shortMulDotProduct", TestReductions::shortMulDotProduct); + tests.put("shortMinDotProduct", TestReductions::shortMinDotProduct); + tests.put("shortMaxDotProduct", TestReductions::shortMaxDotProduct); + tests.put("shortAndBig", TestReductions::shortAndBig); + tests.put("shortOrBig", TestReductions::shortOrBig); + tests.put("shortXorBig", TestReductions::shortXorBig); + tests.put("shortAddBig", TestReductions::shortAddBig); + tests.put("shortMulBig", TestReductions::shortMulBig); + tests.put("shortMinBig", TestReductions::shortMinBig); + tests.put("shortMaxBig", TestReductions::shortMaxBig); + + tests.put("intAndSimple", TestReductions::intAndSimple); + tests.put("intOrSimple", TestReductions::intOrSimple); + tests.put("intXorSimple", TestReductions::intXorSimple); + tests.put("intAddSimple", TestReductions::intAddSimple); + tests.put("intMulSimple", TestReductions::intMulSimple); + tests.put("intMinSimple", TestReductions::intMinSimple); + tests.put("intMaxSimple", TestReductions::intMaxSimple); + tests.put("intAndDotProduct", TestReductions::intAndDotProduct); + tests.put("intOrDotProduct", TestReductions::intOrDotProduct); + tests.put("intXorDotProduct", TestReductions::intXorDotProduct); + tests.put("intAddDotProduct", TestReductions::intAddDotProduct); + tests.put("intMulDotProduct", 
TestReductions::intMulDotProduct); + tests.put("intMinDotProduct", TestReductions::intMinDotProduct); + tests.put("intMaxDotProduct", TestReductions::intMaxDotProduct); + tests.put("intAndBig", TestReductions::intAndBig); + tests.put("intOrBig", TestReductions::intOrBig); + tests.put("intXorBig", TestReductions::intXorBig); + tests.put("intAddBig", TestReductions::intAddBig); + tests.put("intMulBig", TestReductions::intMulBig); + tests.put("intMinBig", TestReductions::intMinBig); + tests.put("intMaxBig", TestReductions::intMaxBig); + + tests.put("longAndSimple", TestReductions::longAndSimple); + tests.put("longOrSimple", TestReductions::longOrSimple); + tests.put("longXorSimple", TestReductions::longXorSimple); + tests.put("longAddSimple", TestReductions::longAddSimple); + tests.put("longMulSimple", TestReductions::longMulSimple); + tests.put("longMinSimple", TestReductions::longMinSimple); + tests.put("longMaxSimple", TestReductions::longMaxSimple); + tests.put("longAndDotProduct", TestReductions::longAndDotProduct); + tests.put("longOrDotProduct", TestReductions::longOrDotProduct); + tests.put("longXorDotProduct", TestReductions::longXorDotProduct); + tests.put("longAddDotProduct", TestReductions::longAddDotProduct); + tests.put("longMulDotProduct", TestReductions::longMulDotProduct); + tests.put("longMinDotProduct", TestReductions::longMinDotProduct); + tests.put("longMaxDotProduct", TestReductions::longMaxDotProduct); + tests.put("longAndBig", TestReductions::longAndBig); + tests.put("longOrBig", TestReductions::longOrBig); + tests.put("longXorBig", TestReductions::longXorBig); + tests.put("longAddBig", TestReductions::longAddBig); + tests.put("longMulBig", TestReductions::longMulBig); + tests.put("longMinBig", TestReductions::longMinBig); + tests.put("longMaxBig", TestReductions::longMaxBig); + + tests.put("floatAddSimple", TestReductions::floatAddSimple); + tests.put("floatMulSimple", TestReductions::floatMulSimple); + tests.put("floatMinSimple", 
TestReductions::floatMinSimple); + tests.put("floatMaxSimple", TestReductions::floatMaxSimple); + tests.put("floatAddDotProduct", TestReductions::floatAddDotProduct); + tests.put("floatMulDotProduct", TestReductions::floatMulDotProduct); + tests.put("floatMinDotProduct", TestReductions::floatMinDotProduct); + tests.put("floatMaxDotProduct", TestReductions::floatMaxDotProduct); + tests.put("floatAddBig", TestReductions::floatAddBig); + tests.put("floatMulBig", TestReductions::floatMulBig); + tests.put("floatMinBig", TestReductions::floatMinBig); + tests.put("floatMaxBig", TestReductions::floatMaxBig); + + tests.put("doubleAddSimple", TestReductions::doubleAddSimple); + tests.put("doubleMulSimple", TestReductions::doubleMulSimple); + tests.put("doubleMinSimple", TestReductions::doubleMinSimple); + tests.put("doubleMaxSimple", TestReductions::doubleMaxSimple); + tests.put("doubleAddDotProduct", TestReductions::doubleAddDotProduct); + tests.put("doubleMulDotProduct", TestReductions::doubleMulDotProduct); + tests.put("doubleMinDotProduct", TestReductions::doubleMinDotProduct); + tests.put("doubleMaxDotProduct", TestReductions::doubleMaxDotProduct); + tests.put("doubleAddBig", TestReductions::doubleAddBig); + tests.put("doubleMulBig", TestReductions::doubleMulBig); + tests.put("doubleMinBig", TestReductions::doubleMinBig); + tests.put("doubleMaxBig", TestReductions::doubleMaxBig); // Compute gold value for all test methods before compilation for (Map.Entry entry : tests.entrySet()) { @@ -108,8 +241,141 @@ public TestReductions() { } @Warmup(100) - @Run(test = {"test1", - "test2"}) + @Run(test = {"byteAndSimple", + "byteOrSimple", + "byteXorSimple", + "byteAddSimple", + "byteMulSimple", + "byteMinSimple", + "byteMaxSimple", + "byteAndDotProduct", + "byteOrDotProduct", + "byteXorDotProduct", + "byteAddDotProduct", + "byteMulDotProduct", + "byteMinDotProduct", + "byteMaxDotProduct", + "byteAndBig", + "byteOrBig", + "byteXorBig", + "byteAddBig", + "byteMulBig", + 
"byteMinBig", + "byteMaxBig", + + "charAndSimple", + "charOrSimple", + "charXorSimple", + "charAddSimple", + "charMulSimple", + "charMinSimple", + "charMaxSimple", + "charAndDotProduct", + "charOrDotProduct", + "charXorDotProduct", + "charAddDotProduct", + "charMulDotProduct", + "charMinDotProduct", + "charMaxDotProduct", + "charAndBig", + "charOrBig", + "charXorBig", + "charAddBig", + "charMulBig", + "charMinBig", + "charMaxBig", + + "shortAndSimple", + "shortOrSimple", + "shortXorSimple", + "shortAddSimple", + "shortMulSimple", + "shortMinSimple", + "shortMaxSimple", + "shortAndDotProduct", + "shortOrDotProduct", + "shortXorDotProduct", + "shortAddDotProduct", + "shortMulDotProduct", + "shortMinDotProduct", + "shortMaxDotProduct", + "shortAndBig", + "shortOrBig", + "shortXorBig", + "shortAddBig", + "shortMulBig", + "shortMinBig", + "shortMaxBig", + + "intAndSimple", + "intOrSimple", + "intXorSimple", + "intAddSimple", + "intMulSimple", + "intMinSimple", + "intMaxSimple", + "intAndDotProduct", + "intOrDotProduct", + "intXorDotProduct", + "intAddDotProduct", + "intMulDotProduct", + "intMinDotProduct", + "intMaxDotProduct", + "intAndBig", + "intOrBig", + "intXorBig", + "intAddBig", + "intMulBig", + "intMinBig", + "intMaxBig", + + "longAndSimple", + "longOrSimple", + "longXorSimple", + "longAddSimple", + "longMulSimple", + "longMinSimple", + "longMaxSimple", + "longAndDotProduct", + "longOrDotProduct", + "longXorDotProduct", + "longAddDotProduct", + "longMulDotProduct", + "longMinDotProduct", + "longMaxDotProduct", + "longAndBig", + "longOrBig", + "longXorBig", + "longAddBig", + "longMulBig", + "longMinBig", + "longMaxBig", + + "floatAddSimple", + "floatMulSimple", + "floatMinSimple", + "floatMaxSimple", + "floatAddDotProduct", + "floatMulDotProduct", + "floatMinDotProduct", + "floatMaxDotProduct", + "floatAddBig", + "floatMulBig", + "floatMinBig", + "floatMaxBig", + + "doubleAddSimple", + "doubleMulSimple", + "doubleMinSimple", + "doubleMaxSimple", + 
"doubleAddDotProduct", + "doubleMulDotProduct", + "doubleMinDotProduct", + "doubleMaxDotProduct", + "doubleAddBig", + "doubleMulBig", + "doubleMinBig", + "doubleMaxBig"}) public void runTests() { for (Map.Entry entry : tests.entrySet()) { String name = entry.getKey(); @@ -168,37 +434,1316 @@ static double[] fillRandom(double[] a) { return a; } + // ---------byte***Simple ------------------------------------------------------------ + @Test + private static byte byteAndSimple() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc &= val; + } + return acc; + } + @Test - // @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", - // IRNode.STORE_VECTOR, "> 0", - // ".*multiversion.*", "= 0"}, - // phase = CompilePhase.PRINT_IDEAL, - // applyIfPlatform = {"64-bit", "true"}, - // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) - // // Should always vectorize, no speculative runtime check required. - static byte test1() { + private static byte byteOrSimple() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { byte val = in1B[i]; - acc += val; + acc |= val; + } + return acc; + } + + @Test + private static byte byteXorSimple() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc ^= val; } return acc; } @Test - // @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", - // IRNode.STORE_VECTOR, "> 0", - // ".*multiversion.*", "= 0"}, - // phase = CompilePhase.PRINT_IDEAL, - // applyIfPlatform = {"64-bit", "true"}, - // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) - // // Should always vectorize, no speculative runtime check required. 
- static byte test2() { + private static byte byteAddSimple() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { byte val = in1B[i]; + acc += val; + } + return acc; + } + + @Test + private static byte byteMulSimple() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc *= val; + } + return acc; + } + + @Test + private static byte byteMinSimple() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + private static byte byteMaxSimple() { + byte acc = Byte.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------byte***DotProduct ------------------------------------------------------------ + @Test + private static byte byteAndDotProduct() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc &= val; + } + return acc; + } + + @Test + private static byte byteOrDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc |= val; + } + return acc; + } + + @Test + private static byte byteXorDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc ^= val; + } + return acc; + } + + @Test + private static byte byteAddDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc += val; + } + return acc; + } + + @Test + private static byte byteMulDotProduct() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc *= val; + } + return acc; + } + + @Test + private static byte byteMinDotProduct() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int 
i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + private static byte byteMaxDotProduct() { + byte acc = Byte.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------byte***Big ------------------------------------------------------------ + @Test + private static byte byteAndBig() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc &= val; + } + return acc; + } + + @Test + private static byte byteOrBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc |= val; + } + return acc; + } + + @Test + private static byte byteXorBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc ^= val; + } + return acc; + } + + @Test + private static byte byteAddBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc += val; + } + return acc; + } + + @Test + private static byte byteMulBig() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc *= val; + } + return acc; + } + + @Test + private static byte byteMinBig() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + private static byte byteMaxBig() { + byte acc = Byte.MIN_VALUE; // 
neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------char***Simple ------------------------------------------------------------ + @Test + private static char charAndSimple() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc &= val; + } + return acc; + } + + @Test + private static char charOrSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc |= val; + } + return acc; + } + + @Test + private static char charXorSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc ^= val; + } + return acc; + } + + @Test + private static char charAddSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc += val; + } + return acc; + } + + @Test + private static char charMulSimple() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc *= val; + } + return acc; + } + + @Test + private static char charMinSimple() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + private static char charMaxSimple() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------char***DotProduct ------------------------------------------------------------ + @Test + private static char charAndDotProduct() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc &= val; + } + return acc; + } + + @Test + private static char charOrDotProduct() { + char acc = 0; // neutral 
element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc |= val; + } + return acc; + } + + @Test + private static char charXorDotProduct() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc ^= val; + } + return acc; + } + + @Test + private static char charAddDotProduct() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc += val; + } + return acc; + } + + @Test + private static char charMulDotProduct() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); acc *= val; } return acc; } + + @Test + private static char charMinDotProduct() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + private static char charMaxDotProduct() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------char***Big ------------------------------------------------------------ + @Test + private static char charAndBig() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc &= val; + } + return acc; + } + + @Test + private static char charOrBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc |= val; + } + return acc; + } + + @Test + private static char charXorBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc ^= val; + } + return acc; + } 
+ + @Test + private static char charAddBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc += val; + } + return acc; + } + + @Test + private static char charMulBig() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc *= val; + } + return acc; + } + + @Test + private static char charMinBig() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + private static char charMaxBig() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------short***Simple ------------------------------------------------------------ + @Test + private static short shortAndSimple() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc &= val; + } + return acc; + } + + @Test + private static short shortOrSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc |= val; + } + return acc; + } + + @Test + private static short shortXorSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc ^= val; + } + return acc; + } + + @Test + private static short shortAddSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc += val; + } + return acc; + } + + @Test + private static short shortMulSimple() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = 
in1S[i]; + acc *= val; + } + return acc; + } + + @Test + private static short shortMinSimple() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + private static short shortMaxSimple() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------short***DotProduct ------------------------------------------------------------ + @Test + private static short shortAndDotProduct() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc &= val; + } + return acc; + } + + @Test + private static short shortOrDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc |= val; + } + return acc; + } + + @Test + private static short shortXorDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc ^= val; + } + return acc; + } + + @Test + private static short shortAddDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc += val; + } + return acc; + } + + @Test + private static short shortMulDotProduct() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc *= val; + } + return acc; + } + + @Test + private static short shortMinDotProduct() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + private static short shortMaxDotProduct() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short 
val = (short)(in1S[i] * in2S[i]); + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------short***Big ------------------------------------------------------------ + @Test + private static short shortAndBig() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc &= val; + } + return acc; + } + + @Test + private static short shortOrBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc |= val; + } + return acc; + } + + @Test + private static short shortXorBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc ^= val; + } + return acc; + } + + @Test + private static short shortAddBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc += val; + } + return acc; + } + + @Test + private static short shortMulBig() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc *= val; + } + return acc; + } + + @Test + private static short shortMinBig() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + private static short shortMaxBig() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------int***Simple 
------------------------------------------------------------ + @Test + private static int intAndSimple() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc &= val; + } + return acc; + } + + @Test + private static int intOrSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc |= val; + } + return acc; + } + + @Test + private static int intXorSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc ^= val; + } + return acc; + } + + @Test + private static int intAddSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc += val; + } + return acc; + } + + @Test + private static int intMulSimple() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc *= val; + } + return acc; + } + + @Test + private static int intMinSimple() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static int intMaxSimple() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------int***DotProduct ------------------------------------------------------------ + @Test + private static int intAndDotProduct() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc &= val; + } + return acc; + } + + @Test + private static int intOrDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc |= val; + } + return acc; + } + + @Test + private static int intXorDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc ^= val; + } + return 
acc; + } + + @Test + private static int intAddDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc += val; + } + return acc; + } + + @Test + private static int intMulDotProduct() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc *= val; + } + return acc; + } + + @Test + private static int intMinDotProduct() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static int intMaxDotProduct() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------int***Big ------------------------------------------------------------ + @Test + private static int intAndBig() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc &= val; + } + return acc; + } + + @Test + private static int intOrBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc |= val; + } + return acc; + } + + @Test + private static int intXorBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc ^= val; + } + return acc; + } + + @Test + private static int intAddBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc += val; + } + return acc; + } + + @Test + private static int intMulBig() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + 
(in2I[i] * in3I[i]); + acc *= val; + } + return acc; + } + + @Test + private static int intMinBig() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static int intMaxBig() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc = Math.max(acc, val); + } + return acc; + } + + // ---------long***Simple ------------------------------------------------------------ + @Test + private static long longAndSimple() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc &= val; + } + return acc; + } + + @Test + private static long longOrSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc |= val; + } + return acc; + } + + @Test + private static long longXorSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc ^= val; + } + return acc; + } + + @Test + private static long longAddSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc += val; + } + return acc; + } + + @Test + private static long longMulSimple() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc *= val; + } + return acc; + } + + @Test + private static long longMinSimple() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static long longMaxSimple() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // 
---------long***DotProduct ------------------------------------------------------------ + @Test + private static long longAndDotProduct() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc &= val; + } + return acc; + } + + @Test + private static long longOrDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc |= val; + } + return acc; + } + + @Test + private static long longXorDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc ^= val; + } + return acc; + } + + @Test + private static long longAddDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc += val; + } + return acc; + } + + @Test + private static long longMulDotProduct() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc *= val; + } + return acc; + } + + @Test + private static long longMinDotProduct() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static long longMaxDotProduct() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------long***Big ------------------------------------------------------------ + @Test + private static long longAndBig() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc &= val; + } + return acc; + } + + @Test + private static long longOrBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * 
in3L[i]) + (in2L[i] * in3L[i]); + acc |= val; + } + return acc; + } + + @Test + private static long longXorBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc ^= val; + } + return acc; + } + + @Test + private static long longAddBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc += val; + } + return acc; + } + + @Test + private static long longMulBig() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc *= val; + } + return acc; + } + + @Test + private static long longMinBig() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static long longMaxBig() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***Simple ------------------------------------------------------------ + @Test + private static float floatAddSimple() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc += val; + } + return acc; + } + + @Test + private static float floatMulSimple() { + float acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc *= val; + } + return acc; + } + + @Test + private static float floatMinSimple() { + float acc = Float.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static float floatMaxSimple() { + 
float acc = Float.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***DotProduct ------------------------------------------------------------ + @Test + private static float floatAddDotProduct() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc += val; + } + return acc; + } + + @Test + private static float floatMulDotProduct() { + float acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc *= val; + } + return acc; + } + + @Test + private static float floatMinDotProduct() { + float acc = Float.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static float floatMaxDotProduct() { + float acc = Float.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***Big ------------------------------------------------------------ + @Test + private static float floatAddBig() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]); + acc += val; + } + return acc; + } + + @Test + private static float floatMulBig() { + float acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]); + acc *= val; + } + return acc; + } + + @Test + private static float floatMinBig() { + float acc = Float.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static float floatMaxBig() { + float acc = Float.MIN_VALUE; // 
neutral element + for (int i = 0; i < SIZE; i++) { + float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]); + acc = Math.max(acc, val); + } + return acc; + } + + // ---------double***Simple ------------------------------------------------------------ + @Test + private static double doubleAddSimple() { + double acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i]; + acc += val; + } + return acc; + } + + @Test + private static double doubleMulSimple() { + double acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i]; + acc *= val; + } + return acc; + } + + @Test + private static double doubleMinSimple() { + double acc = Double.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static double doubleMaxSimple() { + double acc = Double.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------double***DotProduct ------------------------------------------------------------ + @Test + private static double doubleAddDotProduct() { + double acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i] * in2D[i]; + acc += val; + } + return acc; + } + + @Test + private static double doubleMulDotProduct() { + double acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i] * in2D[i]; + acc *= val; + } + return acc; + } + + @Test + private static double doubleMinDotProduct() { + double acc = Double.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i] * in2D[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static double doubleMaxDotProduct() { + double acc = Double.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = in1D[i] * in2D[i]; + acc = Math.max(acc, 
val); + } + return acc; + } + + // ---------double***Big ------------------------------------------------------------ + @Test + private static double doubleAddBig() { + double acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]); + acc += val; + } + return acc; + } + + @Test + private static double doubleMulBig() { + double acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]); + acc *= val; + } + return acc; + } + + @Test + private static double doubleMinBig() { + double acc = Double.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + private static double doubleMaxBig() { + double acc = Double.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]); + acc = Math.max(acc, val); + } + return acc; + } + + } From 802054a051af71c98c08ba0ff75e79cd7115cb16 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 17:00:06 +0200 Subject: [PATCH 21/39] wip IR rules --- .../loopopts/superword/TestReductions.java | 92 ++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 6266c08386f28..bd8bd2359ed2a 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -26,7 +26,23 @@ * @bug 8340093 * @summary Test vectorization of reduction loops. 
* @library /test/lib / - * @run driver compiler.loopopts.superword.TestReductions xxxx + * @run driver compiler.loopopts.superword.TestReductions P0 + */ + +/* + * @test + * @bug 8340093 + * @summary Test vectorization of reduction loops. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions P1 + */ + +/* + * @test + * @bug 8340093 + * @summary Test vectorization of reduction loops. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions P2 */ package compiler.loopopts.superword; @@ -87,7 +103,9 @@ interface TestFunction { public static void main(String[] args) { TestFramework framework = new TestFramework(TestReductions.class); switch (args[0]) { - case "xxxx" -> { framework.addFlags("-XX:-AlignVector"); } + case "P0" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=0"); } + case "P1" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=1"); } + case "P2" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2"); } default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); } }; framework.start(); @@ -436,6 +454,7 @@ static double[] fillRandom(double[] a) { // ---------byte***Simple ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAndSimple() { byte acc = (byte)0xFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -446,6 +465,7 @@ private static byte byteAndSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. 
private static byte byteOrSimple() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -456,6 +476,7 @@ private static byte byteOrSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteXorSimple() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -466,6 +487,7 @@ private static byte byteXorSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAddSimple() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -476,6 +498,7 @@ private static byte byteAddSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMulSimple() { byte acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -486,6 +509,7 @@ private static byte byteMulSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMinSimple() { byte acc = Byte.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -496,6 +520,7 @@ private static byte byteMinSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMaxSimple() { byte acc = Byte.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -507,6 +532,7 @@ private static byte byteMaxSimple() { // ---------byte***DotProduct ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAndDotProduct() { byte acc = (byte)0xFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -517,6 +543,7 @@ private static byte byteAndDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. 
private static byte byteOrDotProduct() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -527,6 +554,7 @@ private static byte byteOrDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteXorDotProduct() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -537,6 +565,7 @@ private static byte byteXorDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAddDotProduct() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -547,6 +576,7 @@ private static byte byteAddDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMulDotProduct() { byte acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -557,6 +587,7 @@ private static byte byteMulDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMinDotProduct() { byte acc = Byte.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -567,6 +598,7 @@ private static byte byteMinDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMaxDotProduct() { byte acc = Byte.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -578,6 +610,7 @@ private static byte byteMaxDotProduct() { // ---------byte***Big ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAndBig() { byte acc = (byte)0xFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -588,6 +621,7 @@ private static byte byteAndBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. 
private static byte byteOrBig() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -598,6 +632,7 @@ private static byte byteOrBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteXorBig() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -608,6 +643,7 @@ private static byte byteXorBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteAddBig() { byte acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -618,6 +654,7 @@ private static byte byteAddBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMulBig() { byte acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -628,6 +665,7 @@ private static byte byteMulBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMinBig() { byte acc = Byte.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -638,6 +676,7 @@ private static byte byteMinBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. private static byte byteMaxBig() { byte acc = Byte.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -649,6 +688,7 @@ private static byte byteMaxBig() { // ---------char***Simple ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAndSimple() { char acc = (char)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -659,6 +699,7 @@ private static char charAndSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. 
private static char charOrSimple() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -669,6 +710,7 @@ private static char charOrSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charXorSimple() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -679,6 +721,7 @@ private static char charXorSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAddSimple() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -689,6 +732,7 @@ private static char charAddSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMulSimple() { char acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -699,6 +743,7 @@ private static char charMulSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMinSimple() { char acc = Character.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -709,6 +754,7 @@ private static char charMinSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMaxSimple() { char acc = Character.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -720,6 +766,7 @@ private static char charMaxSimple() { // ---------char***DotProduct ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAndDotProduct() { char acc = (char)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -730,6 +777,7 @@ private static char charAndDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. 
private static char charOrDotProduct() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -740,6 +788,7 @@ private static char charOrDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charXorDotProduct() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -750,6 +799,7 @@ private static char charXorDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAddDotProduct() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -760,6 +810,7 @@ private static char charAddDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMulDotProduct() { char acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -770,6 +821,7 @@ private static char charMulDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMinDotProduct() { char acc = Character.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -780,6 +832,7 @@ private static char charMinDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMaxDotProduct() { char acc = Character.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -791,6 +844,7 @@ private static char charMaxDotProduct() { // ---------char***Big ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAndBig() { char acc = (char)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -801,6 +855,7 @@ private static char charAndBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. 
private static char charOrBig() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -811,6 +866,7 @@ private static char charOrBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charXorBig() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -821,6 +877,7 @@ private static char charXorBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charAddBig() { char acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -831,6 +888,7 @@ private static char charAddBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMulBig() { char acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -841,6 +899,7 @@ private static char charMulBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMinBig() { char acc = Character.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -851,6 +910,7 @@ private static char charMinBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. private static char charMaxBig() { char acc = Character.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -862,6 +922,7 @@ private static char charMaxBig() { // ---------short***Simple ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortAndSimple() { short acc = (short)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -872,6 +933,7 @@ private static short shortAndSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
private static short shortOrSimple() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -882,6 +944,7 @@ private static short shortOrSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortXorSimple() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -892,6 +955,7 @@ private static short shortXorSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortAddSimple() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -902,6 +966,7 @@ private static short shortAddSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMulSimple() { short acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -912,6 +977,7 @@ private static short shortMulSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMinSimple() { short acc = Short.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -922,6 +988,7 @@ private static short shortMinSimple() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMaxSimple() { short acc = Short.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -933,6 +1000,7 @@ private static short shortMaxSimple() { // ---------short***DotProduct ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortAndDotProduct() { short acc = (short)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -943,6 +1011,7 @@ private static short shortAndDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
private static short shortOrDotProduct() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -953,6 +1022,7 @@ private static short shortOrDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortXorDotProduct() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -963,6 +1033,7 @@ private static short shortXorDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortAddDotProduct() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -973,6 +1044,7 @@ private static short shortAddDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMulDotProduct() { short acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -983,6 +1055,7 @@ private static short shortMulDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMinDotProduct() { short acc = Short.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -993,6 +1066,7 @@ private static short shortMinDotProduct() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMaxDotProduct() { short acc = Short.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1004,6 +1078,7 @@ private static short shortMaxDotProduct() { // ---------short***Big ------------------------------------------------------------ @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
private static short shortAndBig() { short acc = (short)0xFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1014,6 +1089,7 @@ private static short shortAndBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortOrBig() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1024,6 +1100,7 @@ private static short shortOrBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortXorBig() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1034,6 +1111,7 @@ private static short shortXorBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortAddBig() { short acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1044,6 +1122,7 @@ private static short shortAddBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMulBig() { short acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1054,6 +1133,7 @@ private static short shortMulBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. private static short shortMinBig() { short acc = Short.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1064,6 +1144,7 @@ private static short shortMinBig() { } @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
private static short shortMaxBig() { short acc = Short.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1075,6 +1156,13 @@ private static short shortMaxBig() { // ---------int***Simple ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAndSimple() { int acc = 0xFFFFFFFF; // neutral element for (int i = 0; i < SIZE; i++) { From 90691a85d7f9ff35e77edb0810db6fe994ba779f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 17:17:13 +0200 Subject: [PATCH 22/39] int ir rules --- .../loopopts/superword/TestReductions.java | 140 ++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index bd8bd2359ed2a..eb9b8d82b03df 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1173,6 +1173,13 @@ private static int intAndSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intOrSimple() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1183,6 +1190,13 @@ private static int intOrSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = 
{"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intXorSimple() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1193,6 +1207,13 @@ private static int intXorSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAddSimple() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1203,6 +1224,13 @@ private static int intAddSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMulSimple() { int acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1213,6 +1241,13 @@ private static int intMulSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMinSimple() { int acc = Integer.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1223,6 +1258,13 @@ private static int intMinSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = 
{"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMaxSimple() { int acc = Integer.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1234,6 +1276,13 @@ private static int intMaxSimple() { // ---------int***DotProduct ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAndDotProduct() { int acc = 0xFFFFFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1244,6 +1293,13 @@ private static int intAndDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intOrDotProduct() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1254,6 +1310,13 @@ private static int intOrDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intXorDotProduct() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1264,6 +1327,13 @@ private static int intXorDotProduct() { } @Test + @IR(counts 
= {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAddDotProduct() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1274,6 +1344,13 @@ private static int intAddDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMulDotProduct() { int acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1284,6 +1361,13 @@ private static int intMulDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMinDotProduct() { int acc = Integer.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1294,6 +1378,13 @@ private static int intMinDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMaxDotProduct() { int acc = Integer.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1305,6 +1396,13 @@ private static 
int intMaxDotProduct() { // ---------int***Big ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAndBig() { int acc = 0xFFFFFFFF; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1315,6 +1413,13 @@ private static int intAndBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intOrBig() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1325,6 +1430,13 @@ private static int intOrBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intXorBig() { int acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1335,6 +1447,13 @@ private static int intXorBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intAddBig() { int acc = 0; // neutral element for (int i = 0; i < 
SIZE; i++) { @@ -1345,6 +1464,13 @@ private static int intAddBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMulBig() { int acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1355,6 +1481,13 @@ private static int intMulBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMinBig() { int acc = Integer.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1365,6 +1498,13 @@ private static int intMinBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static int intMaxBig() { int acc = Integer.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { From 8ecbf717b5cf8ec8da97abf2335239a3bc656990 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 17:24:00 +0200 Subject: [PATCH 23/39] first long ir --- .../jtreg/compiler/loopopts/superword/TestReductions.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index eb9b8d82b03df..49880b0199b9c 100644 --- 
a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1516,6 +1516,13 @@ private static int intMaxBig() { // ---------long***Simple ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndSimple() { long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element for (int i = 0; i < SIZE; i++) { From d3dad214a76cad179822f05203da7526aa67581c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 17:58:59 +0200 Subject: [PATCH 24/39] long ir rules --- .../loopopts/superword/TestReductions.java | 156 ++++++++++++++++++ 1 file changed, 156 insertions(+) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 49880b0199b9c..a04233d4651aa 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1533,6 +1533,14 @@ private static long longAndSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1543,6 +1551,14 @@ private static long longOrSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + 
IRNode.XOR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1553,6 +1569,14 @@ private static long longXorSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1563,6 +1587,14 @@ private static long longAddSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMulSimple() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1573,6 +1605,14 @@ private static long longMulSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinSimple() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1583,6 +1623,14 @@ private static long longMinSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + 
IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMaxSimple() { long acc = Long.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1594,6 +1642,14 @@ private static long longMaxSimple() { // ---------long***DotProduct ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAndDotProduct() { long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1604,6 +1660,14 @@ private static long longAndDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrDotProduct() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1614,6 +1678,14 @@ private static long longOrDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorDotProduct() { long acc = 0; // neutral element for 
(int i = 0; i < SIZE; i++) { @@ -1624,6 +1696,14 @@ private static long longXorDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddDotProduct() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1634,6 +1714,14 @@ private static long longAddDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMulDotProduct() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1644,6 +1732,14 @@ private static long longMulDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinDotProduct() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1654,6 +1750,13 @@ private static long longMinDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long 
longMaxDotProduct() { long acc = Long.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1665,6 +1768,13 @@ private static long longMaxDotProduct() { // ---------long***Big ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndBig() { long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1675,6 +1785,14 @@ private static long longAndBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1685,6 +1803,14 @@ private static long longOrBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1695,6 +1821,14 @@ private static long longXorBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = 
IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1705,6 +1839,13 @@ private static long longAddBig() { } @Test + //@IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + // IRNode.MUL_REDUCTION_VL, "> 0", + // IRNode.MUL_VL, "> 0"}, + // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + // applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L) + // TODO: investigate, file report / issue. private static long longMulBig() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1715,6 +1856,14 @@ private static long longMulBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinBig() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1725,6 +1874,13 @@ private static long longMinBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxBig() { long acc = Long.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { From b8251094af5e8275d3e42b23db1ae552cea507d6 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 24 Oct 2025 18:26:22 +0200 Subject: [PATCH 25/39] floating add ir test --- .../loopopts/superword/TestReductions.java | 48 +++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git 
a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index a04233d4651aa..1e4b54dbdb5e3 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1892,6 +1892,16 @@ private static long longMaxBig() { // ---------float***Simple ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can vectorize. + // Scalar: n loads + n adds + // Vector: n loads + n adds + n extract (sequential order of reduction) private static float floatAddSimple() { float acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1933,6 +1943,13 @@ private static float floatMaxSimple() { // ---------float***DotProduct ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatAddDotProduct() { float acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1974,6 +1991,13 @@ private static float floatMaxDotProduct() { // ---------float***Big ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", 
"asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatAddBig() { float acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2015,6 +2039,16 @@ private static float floatMaxBig() { // ---------double***Simple ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VD, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can vectorize. + // Scalar: n loads + n adds + // Vector: n loads + n adds + n extract (sequential order of reduction) private static double doubleAddSimple() { double acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2056,6 +2090,13 @@ private static double doubleMaxSimple() { // ---------double***DotProduct ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VD, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleAddDotProduct() { double acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2097,6 +2138,13 @@ private static double doubleMaxDotProduct() { // ---------double***Big ------------------------------------------------------------ @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf 
= {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleAddBig() { double acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { From 0de4a974b775b6c5ac4a4432852a0833c317fae3 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 25 Oct 2025 13:15:59 +0200 Subject: [PATCH 26/39] double ir tests --- .../loopopts/superword/TestReductions.java | 154 +++++++++++++++--- 1 file changed, 135 insertions(+), 19 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 1e4b54dbdb5e3..da58bf744474c 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1540,7 +1540,6 @@ private static long longAndSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longOrSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1558,7 +1557,6 @@ private static long longOrSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longXorSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1576,7 +1574,6 @@ private static long longXorSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longAddSimple() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1594,7 +1591,6 @@ private static long longAddSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = 
IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMulSimple() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1612,7 +1608,6 @@ private static long longMulSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMinSimple() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1630,7 +1625,6 @@ private static long longMinSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMaxSimple() { long acc = Long.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1649,7 +1643,6 @@ private static long longMaxSimple() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longAndDotProduct() { long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1667,7 +1660,6 @@ private static long longAndDotProduct() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longOrDotProduct() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1685,7 +1677,6 @@ private static long longOrDotProduct() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longXorDotProduct() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1703,7 +1694,6 @@ private static long longXorDotProduct() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) 
@IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longAddDotProduct() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1721,7 +1711,6 @@ private static long longAddDotProduct() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMulDotProduct() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1739,7 +1728,6 @@ private static long longMulDotProduct() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMinDotProduct() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1792,7 +1780,6 @@ private static long longAndBig() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longOrBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1810,7 +1797,6 @@ private static long longOrBig() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longXorBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1828,7 +1814,6 @@ private static long longXorBig() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longAddBig() { long acc = 0; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1863,7 +1848,6 @@ private static long longMulBig() { applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = 
{"AutoVectorizationOverrideProfitability", "= 0"}) - private static long longMinBig() { long acc = Long.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1912,6 +1896,16 @@ private static float floatAddSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can vectorize. + // Scalar: n loads + n mul + // Vector: n loads + n mul + n extract (sequential order of reduction) private static float floatMulSimple() { float acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1922,6 +1916,13 @@ private static float floatMulSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMinSimple() { float acc = Float.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1932,6 +1933,13 @@ private static float floatMinSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMaxSimple() { float acc = Float.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1960,6 +1968,13 @@ private static float floatAddDotProduct() { } @Test + @IR(counts = 
{IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMulDotProduct() { float acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1970,6 +1985,13 @@ private static float floatMulDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMinDotProduct() { float acc = Float.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1980,6 +2002,13 @@ private static float floatMinDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMaxDotProduct() { float acc = Float.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2008,6 +2037,13 @@ private static float floatAddBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMulBig() { float acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2018,6 +2054,13 
@@ private static float floatMulBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMinBig() { float acc = Float.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2028,6 +2071,13 @@ private static float floatMinBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VF, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMaxBig() { float acc = Float.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2039,9 +2089,9 @@ private static float floatMaxBig() { // ---------double***Simple ------------------------------------------------------------ @Test - @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", - IRNode.ADD_REDUCTION_V, "> 0", - IRNode.ADD_VD, "= 0"}, + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.ADD_REDUCTION_VD, "> 0", + IRNode.ADD_VD, "= 0"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) @IR(failOn = IRNode.LOAD_VECTOR_D, @@ -2059,6 +2109,16 @@ private static double doubleAddSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MUL_REDUCTION_VD, "> 0", + IRNode.MUL_VD, "= 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can 
vectorize. + // Scalar: n loads + n mul + // Vector: n loads + n mul + n extract (sequential order of reduction) private static double doubleMulSimple() { double acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2069,6 +2129,13 @@ private static double doubleMulSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMinSimple() { double acc = Double.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2079,6 +2146,13 @@ private static double doubleMinSimple() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMaxSimple() { double acc = Double.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2107,6 +2181,13 @@ private static double doubleAddDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MUL_REDUCTION_VD, "> 0", + IRNode.MUL_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMulDotProduct() { double acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2117,6 +2198,13 @@ private static double doubleMulDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VD, "> 0"}, + applyIfCPUFeatureOr = 
{"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMinDotProduct() { double acc = Double.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2127,6 +2215,13 @@ private static double doubleMinDotProduct() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMaxDotProduct() { double acc = Double.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2155,6 +2250,13 @@ private static double doubleAddBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MUL_REDUCTION_VD, "> 0", + IRNode.MUL_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMulBig() { double acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2165,6 +2267,13 @@ private static double doubleMulBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMinBig() { double acc = Double.MAX_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { @@ -2175,6 +2284,13 @@ private static double doubleMinBig() { } @Test + @IR(counts = {IRNode.LOAD_VECTOR_D, "> 
0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VD, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMaxBig() { double acc = Double.MIN_VALUE; // neutral element for (int i = 0; i < SIZE; i++) { From ff4c1dad6ccea5270e061676d718ff866143aba2 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 08:38:20 +0100 Subject: [PATCH 27/39] AVX2 exception for min/max long --- .../loopopts/superword/TestReductions.java | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index da58bf744474c..cca7b37500769 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1604,8 +1604,11 @@ private static long longMulSimple() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinSimple() { @@ -1621,8 +1624,11 @@ private static long longMinSimple() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxSimple() { @@ -1724,8 +1730,11 @@ private static long longMulDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinDotProduct() { @@ -1741,8 +1750,11 @@ private static long longMinDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxDotProduct() { @@ -1844,8 +1856,11 @@ private static long longMulBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinBig() { @@ -1861,8 +1876,11 @@ private static long longMinBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxBig() { From af8dd438b7373a2bb9c3d967b3927a3781520972 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 08:59:55 +0100 Subject: [PATCH 28/39] avx2 exception for mul long --- .../compiler/loopopts/superword/TestReductions.java | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index cca7b37500769..d45c4afe91e3e 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1587,8 +1587,11 @@ private static long longAddSimple() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MUL_REDUCTION_VL, "> 0", IRNode.MUL_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512dq", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370673 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMulSimple() { @@ -1713,8 +1716,11 @@ private static long longAddDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MUL_REDUCTION_VL, "> 0", IRNode.MUL_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx512dq", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370673 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMulDotProduct() { From d457b0461732965dc5a8b57afe834c746e45da10 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 09:09:28 +0100 Subject: [PATCH 29/39] AVX=0 ir rule adjustments --- .../loopopts/superword/TestReductions.java | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index d45c4afe91e3e..774abdf339440 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1943,7 +1943,7 @@ private static float floatMulSimple() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -1960,7 +1960,7 @@ private static float floatMinSimple() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2012,7 +2012,7 @@ private static float floatMulDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", 
"> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2029,7 +2029,7 @@ private static float floatMinDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2081,7 +2081,7 @@ private static float floatMulBig() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2098,7 +2098,7 @@ private static float floatMinBig() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2156,7 +2156,7 @@ private static double doubleMulSimple() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2173,7 +2173,7 @@ private static double doubleMinSimple() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VD, "> 
0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2225,7 +2225,7 @@ private static double doubleMulDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2242,7 +2242,7 @@ private static double doubleMinDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2294,7 +2294,7 @@ private static double doubleMulBig() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) @@ -2311,7 +2311,7 @@ private static double doubleMinBig() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = 
{"AutoVectorizationOverrideProfitability", "= 0"}) From 180d066cc5dc00b8e35aeca526da94af39b2760c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 10:33:43 +0100 Subject: [PATCH 30/39] fix asimd add/mul f/d rules --- .../loopopts/superword/TestReductions.java | 78 +++++++++++++++---- 1 file changed, 63 insertions(+), 15 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 774abdf339440..c0f068df29c9e 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -22,7 +22,7 @@ */ /* - * @test + * @test id=no-vectorization * @bug 8340093 * @summary Test vectorization of reduction loops. * @library /test/lib / @@ -30,7 +30,7 @@ */ /* - * @test + * @test id=vanilla * @bug 8340093 * @summary Test vectorization of reduction loops. * @library /test/lib / @@ -38,7 +38,7 @@ */ /* - * @test + * @test id=force-vectorization * @bug 8340093 * @summary Test vectorization of reduction loops. * @library /test/lib / @@ -1903,8 +1903,12 @@ private static long longMaxBig() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VF, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. 
@@ -1923,8 +1927,12 @@ private static float floatAddSimple() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MUL_REDUCTION_VF, "> 0", IRNode.MUL_VF, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. @@ -1978,8 +1986,12 @@ private static float floatMaxSimple() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VF, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatAddDotProduct() { @@ -1995,8 +2007,12 @@ private static float floatAddDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MUL_REDUCTION_VF, "> 0", IRNode.MUL_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMulDotProduct() { @@ -2047,8 +2063,12 @@ private static float floatMaxDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatAddBig() { @@ -2064,8 +2084,12 @@ private static float floatAddBig() { @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", IRNode.MUL_REDUCTION_VF, "> 0", IRNode.MUL_VF, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. 
@IR(failOn = IRNode.LOAD_VECTOR_F, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static float floatMulBig() { @@ -2116,8 +2140,12 @@ private static float floatMaxBig() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.ADD_REDUCTION_VD, "> 0", IRNode.ADD_VD, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. @@ -2136,8 +2164,12 @@ private static double doubleAddSimple() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MUL_REDUCTION_VD, "> 0", IRNode.MUL_VD, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) // Not considered profitable by cost model, but if forced we can vectorize. 
@@ -2191,8 +2223,12 @@ private static double doubleMaxSimple() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VD, "= 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleAddDotProduct() { @@ -2208,8 +2244,12 @@ private static double doubleAddDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MUL_REDUCTION_VD, "> 0", IRNode.MUL_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMulDotProduct() { @@ -2260,8 +2300,12 @@ private static double doubleMaxDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.ADD_REDUCTION_V, "> 0", IRNode.ADD_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleAddBig() { @@ -2277,8 +2321,12 @@ private static double doubleAddBig() { @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0", IRNode.MUL_REDUCTION_VD, "> 0", IRNode.MUL_VD, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_D, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. @IR(failOn = IRNode.LOAD_VECTOR_D, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static double doubleMulBig() { From c2768d865a1cda9618a1fff4e9d9239bbe9bac09 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 11:03:13 +0100 Subject: [PATCH 31/39] fix some asimd ir rules --- .../loopopts/superword/TestReductions.java | 63 ++++++++++++++++--- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index c0f068df29c9e..8f9cabeaf29d8 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1594,6 +1594,16 @@ private static long longAddSimple() { // I think this could vectorize, but currently does not. Filed: JDK-8370673 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + // TODO: it seems we support this after all on NEON? Investigate! + // Concerning: reduction is done in scalar. But we also have + // a scalar element-wise operation or MulVL ... 
but it is not + // recommended that it is used, see: + // Matcher::match_rule_supported_auto_vectorization + // This probably explains the slowdown we see in the benchmark! + // We should thus also revise all other occurances of MulVL. + // Maybe we also have to fix the code that moves the reduction + // out of the loop, because it seems to introduce the MulVL, + // but probably should not. private static long longMulSimple() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1648,8 +1658,11 @@ private static long longMaxSimple() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.AND_REDUCTION_V, "> 0", IRNode.AND_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndDotProduct() { @@ -1665,8 +1678,11 @@ private static long longAndDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.OR_REDUCTION_V, "> 0", IRNode.OR_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. 
@IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longOrDotProduct() { @@ -1684,6 +1700,9 @@ private static long longOrDotProduct() { IRNode.XOR_VL, "> 0"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longXorDotProduct() { @@ -1716,11 +1735,14 @@ private static long longAddDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MUL_REDUCTION_VL, "> 0", IRNode.MUL_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512dq", "true", "asimd", "true"}, + applyIfCPUFeature = {"avx512dq", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) // I think this could vectorize, but currently does not. Filed: JDK-8370673 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // MulVL is not implemented on NEON, so we also not have the reduction. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMulDotProduct() { @@ -1736,11 +1758,14 @@ private static long longMulDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIfCPUFeature = {"avx512", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMinSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinDotProduct() { @@ -1756,11 +1781,14 @@ private static long longMinDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIfCPUFeature = {"avx512", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxDotProduct() { @@ -1777,8 +1805,11 @@ private static long longMaxDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.AND_REDUCTION_V, "> 0", IRNode.AND_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. 
@IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndBig() { @@ -1794,8 +1825,11 @@ private static long longAndBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.OR_REDUCTION_V, "> 0", IRNode.OR_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longOrBig() { @@ -1811,8 +1845,11 @@ private static long longOrBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.XOR_REDUCTION_V, "> 0", IRNode.XOR_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longXorBig() { @@ -1862,11 +1899,14 @@ private static long longMulBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MIN_REDUCTION_V, "> 0", IRNode.MIN_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIfCPUFeature = {"avx512", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMinSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinBig() { @@ -1882,11 +1922,14 @@ private static long longMinBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MAX_REDUCTION_V, "> 0", IRNode.MAX_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIfCPUFeature = {"avx512", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxBig() { From 1edb758ba6820bccae887b1e020cf4988d24265c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Oct 2025 13:27:23 +0100 Subject: [PATCH 32/39] fix ir test a bit more --- .../loopopts/superword/TestReductions.java | 67 +++++++++++++------ 1 file changed, 46 insertions(+), 21 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 8f9cabeaf29d8..9cf9dc1e25b2c 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -105,7 +105,8 @@ public static void main(String[] args) { switch (args[0]) { case "P0" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=0"); } case "P1" -> { 
framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=1"); } - case "P2" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2"); } + // Note: increasing the node count limit also helps in some cases. + case "P2" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2", "-XX:LoopUnrollLimit=1000"); } default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); } }; framework.start(); @@ -1594,16 +1595,9 @@ private static long longAddSimple() { // I think this could vectorize, but currently does not. Filed: JDK-8370673 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - // TODO: it seems we support this after all on NEON? Investigate! - // Concerning: reduction is done in scalar. But we also have - // a scalar element-wise operation or MulVL ... but it is not - // recommended that it is used, see: - // Matcher::match_rule_supported_auto_vectorization - // This probably explains the slowdown we see in the benchmark! - // We should thus also revise all other occurances of MulVL. - // Maybe we also have to fix the code that moves the reduction - // out of the loop, because it seems to introduce the MulVL, - // but probably should not. + // Note: we get a performance regression for NEON, because it uses a + // scalar implementation for the reduction. + // Filed: JDK-8370686 private static long longMulSimple() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1663,6 +1657,7 @@ private static long longMaxSimple() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. 
+ // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndDotProduct() { @@ -1683,6 +1678,7 @@ private static long longAndDotProduct() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longOrDotProduct() { @@ -1698,11 +1694,12 @@ private static long longOrDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.XOR_REDUCTION_V, "> 0", IRNode.XOR_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longXorDotProduct() { @@ -1718,8 +1715,12 @@ private static long longXorDotProduct() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.ADD_REDUCTION_VL, "> 0", IRNode.ADD_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longAddSimple), MulVL is not. 
+ // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAddDotProduct() { @@ -1743,6 +1744,7 @@ private static long longAddDotProduct() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // MulVL is not implemented on NEON, so we also not have the reduction. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMulDotProduct() { @@ -1766,6 +1768,7 @@ private static long longMulDotProduct() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longMinSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinDotProduct() { @@ -1789,6 +1792,7 @@ private static long longMinDotProduct() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxDotProduct() { @@ -1810,6 +1814,7 @@ private static long longMaxDotProduct() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAndBig() { @@ -1830,6 +1835,7 @@ private static long longAndBig() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. 
+ // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longOrBig() { @@ -1850,6 +1856,7 @@ private static long longOrBig() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longXorBig() { @@ -1865,8 +1872,12 @@ private static long longXorBig() { @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.ADD_REDUCTION_VL, "> 0", IRNode.ADD_VL, "> 0"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIfCPUFeature = {"sse4.1", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longAddSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longAddBig() { @@ -1879,13 +1890,25 @@ private static long longAddBig() { } @Test - //@IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", - // IRNode.MUL_REDUCTION_VL, "> 0", - // IRNode.MUL_VL, "> 0"}, - // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, - // applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) - @IR(failOn = IRNode.LOAD_VECTOR_L) - // TODO: investigate, file report / issue. 
+ @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, + applyIfCPUFeature = {"avx512dq", "true"}, + applyIfAnd = {"AutoVectorizationOverrideProfitability", "> 0", + "LoopUnrollLimit", ">= 1000"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeature = {"avx512dq", "true"}, + applyIfAnd = {"AutoVectorizationOverrideProfitability", "> 0", + "LoopUnrollLimit", "< 1000"}) + // Increasing the body limit seems to help. Filed for investigation: JDK-8370685 + // If you can eliminate this exception for LoopUnrollLimit, please remove + // the flag completely from the test, also the "addFlags" at the top. + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // MulVL is not implemented on NEON, so we also not have the reduction. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMulBig() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { @@ -1907,6 +1930,7 @@ private static long longMulBig() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longMinSimple), MulVL is not. + // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMinBig() { @@ -1930,6 +1954,7 @@ private static long longMinBig() { @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"asimd", "true"}) // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not. 
+ // Filed: JDK-8370686 @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) private static long longMaxBig() { From 18a88983996cc47942f6e5c563093c865259901a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 29 Oct 2025 11:40:53 +0100 Subject: [PATCH 33/39] fix aarch64 long mul reduction perf issue --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 14 ++++++++++---- src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 14 ++++++++++---- src/hotspot/share/opto/vtransform.cpp | 2 +- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 3379041b2ccac..9809d096233a3 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -129,18 +129,24 @@ source %{ bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) { if (UseSVE == 0) { // These operations are not profitable to be vectorized on NEON, because no direct - // NEON instructions support them. But the match rule support for them is profitable for - // Vector API intrinsics. + // NEON instructions support them. They use multiple instructions which is more + // expensive in almost all cases where we would auto vectorize. + // But the match rule support for them is profitable for Vector API intrinsics. if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) || (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || + opcode == Op_MulVL || // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. // They are not suitable for auto-vectorization because the result would not conform // to the JLS, Section Evaluation Order. 
+ // Note: we could implement sequential reductions for these reduction operators, but + // this will still almost never lead to speedups, because the sequential + // reductions are latency limited along the reduction chain, and not + // throughput limited. This is unlike unordered reductions (associative op) + // and element-wise ops which are usually throughput limited. opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || - opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || - opcode == Op_MulVL) { + opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { return false; } } diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 6d296cbdb3ac3..a9f42e1bc08c9 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -119,18 +119,24 @@ source %{ bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) { if (UseSVE == 0) { // These operations are not profitable to be vectorized on NEON, because no direct - // NEON instructions support them. But the match rule support for them is profitable for - // Vector API intrinsics. + // NEON instructions support them. They use multiple instructions which is more + // expensive in almost all cases where we would auto vectorize. + // But the match rule support for them is profitable for Vector API intrinsics. if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) || (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || + opcode == Op_MulVL || // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. // They are not suitable for auto-vectorization because the result would not conform // to the JLS, Section Evaluation Order. 
+ // Note: we could implement sequential reductions for these reduction operators, but + // this will still almost never lead to speedups, because the sequential + // reductions are latency limited along the reduction chain, and not + // throughput limited. This is unlike unordered reductions (associative op) + // and element-wise ops which are usually throughput limited. opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || - opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || - opcode == Op_MulVL) { + opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { return false; } } diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 954a915cbaea4..c245206b609d5 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -1242,7 +1242,7 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou const BasicType bt = element_basic_type(); const int ropc = vector_reduction_opcode(); const int vopc = VectorNode::opcode(sopc, bt); - if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) { + if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) { DEBUG_ONLY( this->print(); ) assert(false, "do not have normal vector op for this reduction"); return false; // not implemented From 2bd9c94dd7d30a571e6972a8dc51e59022085b6d Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 29 Oct 2025 13:44:48 +0100 Subject: [PATCH 34/39] rm assert --- src/hotspot/share/opto/vtransform.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index c245206b609d5..2cfca67e3f697 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -1243,9 +1243,9 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou const int ropc = vector_reduction_opcode(); const int vopc = 
VectorNode::opcode(sopc, bt); if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) { - DEBUG_ONLY( this->print(); ) - assert(false, "do not have normal vector op for this reduction"); - return false; // not implemented + // The element-wise vector operation needed for the vector accumulator + // is not implemented / supported. + return false; } // Traverse up the chain of non strict order reductions, checking that it loops From a8d31d756ea378179044b4f29ef302ba6852f4e3 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 29 Oct 2025 14:55:52 +0100 Subject: [PATCH 35/39] fix IR rules for aarch64 NEON --- .../loopopts/superword/TestReductions.java | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java index 9cf9dc1e25b2c..1cd5cfa1e750c 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -1587,17 +1587,25 @@ private static long longAddSimple() { @Test @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", IRNode.MUL_REDUCTION_VL, "> 0", - IRNode.MUL_VL, "> 0"}, - applyIfCPUFeatureOr = {"avx512dq", "true", "asimd", "true"}, + IRNode.MUL_VL, "> 0"}, // vector accumulator + applyIfCPUFeature = {"avx512dq", "true"}, applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) @IR(failOn = IRNode.LOAD_VECTOR_L, applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) // I think this could vectorize, but currently does not. Filed: JDK-8370673 + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "= 0"}, // Reduction NOT moved out of loop + applyIfCPUFeatureOr = {"asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + // Note: NEON does not support MulVL for auto vectorization. 
There is + // a scalarized implementation, but that is not profitable for + // auto vectorization in almost all cases, and would not be + // profitable here at any rate. + // Hence, we have to keep the reduction inside the loop, and + // cannot use the MulVL as the vector accumulator. @IR(failOn = IRNode.LOAD_VECTOR_L, applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) - // Note: we get a performance regression for NEON, because it uses a - // scalar implementation for the reduction. - // Filed: JDK-8370686 private static long longMulSimple() { long acc = 1; // neutral element for (int i = 0; i < SIZE; i++) { From 3f7ef58ef15f1eac11dbcff85d5fbc830ecd715e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 29 Oct 2025 15:09:10 +0100 Subject: [PATCH 36/39] simplify cost-model impl --- src/hotspot/share/opto/matcher.cpp | 27 ------------------------ src/hotspot/share/opto/matcher.hpp | 5 ----- src/hotspot/share/opto/vectorization.cpp | 17 ++++++++++++--- 3 files changed, 14 insertions(+), 35 deletions(-) diff --git a/src/hotspot/share/opto/matcher.cpp b/src/hotspot/share/opto/matcher.cpp index 3d090210de517..c63cefe7ac201 100644 --- a/src/hotspot/share/opto/matcher.cpp +++ b/src/hotspot/share/opto/matcher.cpp @@ -2678,33 +2678,6 @@ void Matcher::specialize_generic_vector_operands() { } } -// For now, we use unit cost. We might refine that in the future. -// If needed, we could also use platform specific costs, if the -// default here is not accurate enough. -float Matcher::cost_for_scalar(int opcode) { - return 1; -} - -// For now, we use unit cost. We might refine that in the future. -// If needed, we could also use platform specific costs, if the -// default here is not accurate enough. -float Matcher::cost_for_vector(int opcode, int vlen, BasicType bt) { - return 1; -} - -// For now, we use unit cost. We might refine that in the future. -// If needed, we could also use platform specific costs, if the -// default here is not accurate enough. 
-float Matcher::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) { - if (requires_strict_order) { - // Linear: shuffle and reduce - return 2 * vlen; - } else { - // Recursive: shuffle and reduce - return 2 * exact_log2(vlen); - } -} - uint Matcher::vector_length(const Node* n) { const TypeVect* vt = n->bottom_type()->is_vect(); return vt->length(); diff --git a/src/hotspot/share/opto/matcher.hpp b/src/hotspot/share/opto/matcher.hpp index 42e75e6db0182..e4396b423ac0e 100644 --- a/src/hotspot/share/opto/matcher.hpp +++ b/src/hotspot/share/opto/matcher.hpp @@ -333,11 +333,6 @@ class Matcher : public PhaseTransform { static bool vector_rearrange_requires_load_shuffle(BasicType elem_bt, int vlen); - // Cost-Model for Auto-Vectorization - static float cost_for_scalar(int opcode); - static float cost_for_vector(int opcode, int vlen, BasicType bt); - static float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order); - static const RegMask* predicate_reg_mask(void); // Vector width in bytes diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 15e4248cf409a..d071781e11e9e 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -598,8 +598,11 @@ float VLoopAnalyzer::cost() const { return sum; } +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. float VLoopAnalyzer::cost_for_scalar(int opcode) const { - float c = Matcher::cost_for_scalar(opcode); + float c = 1; #ifndef PRODUCT if (_vloop.is_trace_cost()) { tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); @@ -608,8 +611,11 @@ float VLoopAnalyzer::cost_for_scalar(int opcode) const { return c; } +// For now, we use unit cost. We might refine that in the future. 
+// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { - float c = Matcher::cost_for_vector(opcode, vlen, bt); + float c = 1; #ifndef PRODUCT if (_vloop.is_trace_cost()) { tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", @@ -619,8 +625,13 @@ float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { return c; } +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { - float c = Matcher::cost_for_vector_reduction(opcode, vlen, bt, requires_strict_order); + // Each reduction is composed of multiple instructions, each estimated with a unit cost. + // Linear (strict order): a shuffle and a reduce per element (2 * vlen); Recursive (non-strict): a shuffle and a reduce per lane-halving step (2 * log2(vlen)). + float c = requires_strict_order ?
2 * vlen : 2 * exact_log2(vlen); #ifndef PRODUCT if (_vloop.is_trace_cost()) { tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", From 22dab5a4d0a07378b0718dd67e5e470cc37372e9 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 3 Nov 2025 14:56:06 +0100 Subject: [PATCH 37/39] Update src/hotspot/share/opto/vectorization.cpp Co-authored-by: Hannes Greule --- src/hotspot/share/opto/vectorization.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index d071781e11e9e..e231515166173 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -541,7 +541,7 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -// Cost-model heuristic for nodes that do not contribute to computatinal +// Cost-model heuristic for nodes that do not contribute to computational // cost inside the loop. bool VLoopAnalyzer::has_zero_cost(Node* n) const { // Outside body? From d79df4fce4c0464bb97b7b66a5ee490a832f6230 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 3 Nov 2025 17:09:21 +0100 Subject: [PATCH 38/39] More comments for SirYwell --- src/hotspot/share/opto/vectorization.cpp | 3 ++- src/hotspot/share/opto/vtransform.cpp | 4 ++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index e231515166173..ef83358719657 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -625,7 +625,8 @@ float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { return c; } -// For now, we use unit cost. We might refine that in the future. +// For now, we use unit cost, i.e. we count the number of backend instructions +// that the vtnode will use. We might refine that in the future. 
// If needed, we could also use platform specific costs, if the // default here is not accurate enough. float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 2cfca67e3f697..379768d8172bf 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -249,6 +249,10 @@ float VTransformGraph::cost() const { } #endif + // We only want to count the cost of nodes that are in the loop. + // This is especially important for cases where we were able to move + // some nodes outside the loop during VTransform::optimize, e.g.: + // VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop ResourceMark rm; VectorSet in_loop; // vtn->_idx -> bool mark_vtnodes_in_loop(in_loop); From 23906b814a01d910c089eb06e0733a5222fb4eb1 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 5 Nov 2025 10:45:28 +0100 Subject: [PATCH 39/39] rename cost methods for Vladimir K --- src/hotspot/share/opto/superword.cpp | 4 +-- src/hotspot/share/opto/vectorization.cpp | 12 ++++----- src/hotspot/share/opto/vectorization.hpp | 8 +++--- src/hotspot/share/opto/vtransform.cpp | 34 ++++++++++++------------ src/hotspot/share/opto/vtransform.hpp | 4 +-- 5 files changed, 31 insertions(+), 31 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 4af48667c3bc7..dfac5240b504f 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1930,8 +1930,8 @@ bool VTransform::is_profitable() const { if (has_store_to_load_forwarding_failure()) { return false; } // Cost-model - float scalar_cost = _vloop_analyzer.cost(); - float vector_cost = cost(); + float scalar_cost = _vloop_analyzer.cost_for_scalar_loop(); + float vector_cost = cost_for_vector_loop(); #ifndef PRODUCT if (_trace._info) { 
tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f", diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index ef83358719657..98f3d79c9f5ce 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -569,10 +569,10 @@ bool VLoopAnalyzer::has_zero_cost(Node* n) const { } // Compute the cost over all operations in the (scalar) loop. -float VLoopAnalyzer::cost() const { +float VLoopAnalyzer::cost_for_scalar_loop() const { #ifndef PRODUCT if (_vloop.is_trace_cost()) { - tty->print_cr("\nVLoopAnalyzer::cost:"); + tty->print_cr("\nVLoopAnalyzer::cost_for_scalar_loop:"); } #endif @@ -580,7 +580,7 @@ float VLoopAnalyzer::cost() const { for (int j = 0; j < body().body().length(); j++) { Node* n = body().body().at(j); if (!has_zero_cost(n)) { - float c = cost_for_scalar(n->Opcode()); + float c = cost_for_scalar_node(n->Opcode()); sum += c; #ifndef PRODUCT if (_vloop.is_trace_cost_verbose()) { @@ -601,7 +601,7 @@ float VLoopAnalyzer::cost() const { // For now, we use unit cost. We might refine that in the future. // If needed, we could also use platform specific costs, if the // default here is not accurate enough. -float VLoopAnalyzer::cost_for_scalar(int opcode) const { +float VLoopAnalyzer::cost_for_scalar_node(int opcode) const { float c = 1; #ifndef PRODUCT if (_vloop.is_trace_cost()) { @@ -614,7 +614,7 @@ float VLoopAnalyzer::cost_for_scalar(int opcode) const { // For now, we use unit cost. We might refine that in the future. // If needed, we could also use platform specific costs, if the // default here is not accurate enough. 
-float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { +float VLoopAnalyzer::cost_for_vector_node(int opcode, int vlen, BasicType bt) const { float c = 1; #ifndef PRODUCT if (_vloop.is_trace_cost()) { @@ -629,7 +629,7 @@ float VLoopAnalyzer::cost_for_vector(int opcode, int vlen, BasicType bt) const { // that the vtnode will use. We might refine that in the future. // If needed, we could also use platform specific costs, if the // default here is not accurate enough. -float VLoopAnalyzer::cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { +float VLoopAnalyzer::cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { // Each reduction is composed of multiple instructions, each estimated with a unit cost. // Linear: shuffle and reduce Recursive: shuffle and reduce float c = requires_strict_order ? 2 * vlen : 2 * exact_log2(vlen); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 419c29d6544a5..f7099b5b7c0a4 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -853,13 +853,13 @@ class VLoopAnalyzer : StackObj { const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; } // Compute the cost of the (scalar) body. - float cost() const; + float cost_for_scalar_loop() const; bool has_zero_cost(Node* n) const; // Cost-modeling with tracing. 
- float cost_for_scalar(int opcode) const; - float cost_for_vector(int opcode, int vlen, BasicType bt) const; - float cost_for_vector_reduction(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; + float cost_for_scalar_node(int opcode) const; + float cost_for_vector_node(int opcode, int vlen, BasicType bt) const; + float cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; private: bool setup_submodules(); diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 379768d8172bf..9fd6ad1089c55 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -241,11 +241,11 @@ void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { } } -float VTransformGraph::cost() const { +float VTransformGraph::cost_for_vector_loop() const { assert(is_scheduled(), "must already be scheduled"); #ifndef PRODUCT if (_vloop.is_trace_cost()) { - tty->print_cr("\nVTransformGraph::cost:"); + tty->print_cr("\nVTransformGraph::cost_for_vector_loop:"); } #endif @@ -927,7 +927,7 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { // This is an identity transform, but loads and stores must be counted. 
assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted"); - return vloop_analyzer.cost_for_scalar(_node->Opcode()); + return vloop_analyzer.cost_for_scalar_node(_node->Opcode()); } VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const { @@ -948,7 +948,7 @@ float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const if (vloop_analyzer.has_zero_cost(_node)) { return 0; } else { - return vloop_analyzer.cost_for_scalar(_node->Opcode()); + return vloop_analyzer.cost_for_scalar_node(_node->Opcode()); } } @@ -1005,7 +1005,7 @@ VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_sta } float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return vloop_analyzer.cost_for_vector(Op_Replicate, _vlen, _element_type); + return vloop_analyzer.cost_for_vector_node(Op_Replicate, _vlen, _element_type); } VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const { @@ -1016,7 +1016,7 @@ VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply } float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return vloop_analyzer.cost_for_scalar(Op_ConvI2L); + return vloop_analyzer.cost_for_scalar_node(Op_ConvI2L); } VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const { @@ -1028,8 +1028,8 @@ VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_s float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const { int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode); - return vloop_analyzer.cost_for_scalar(Op_AndI) + - vloop_analyzer.cost_for_vector(shift_count_opc, _vlen, _element_bt); + return vloop_analyzer.cost_for_scalar_node(Op_AndI) + + vloop_analyzer.cost_for_vector_node(shift_count_opc, _vlen, _element_bt); } VTransformApplyResult 
VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const { @@ -1048,7 +1048,7 @@ VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& appl } float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return vloop_analyzer.cost_for_vector(Op_PopulateIndex, _vlen, _element_bt); + return vloop_analyzer.cost_for_vector_node(Op_PopulateIndex, _vlen, _element_bt); } VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const { @@ -1063,7 +1063,7 @@ VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& a } float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return vloop_analyzer.cost_for_vector(_vector_opcode, vector_length(), element_basic_type()); + return vloop_analyzer.cost_for_vector_node(_vector_opcode, vector_length(), element_basic_type()); } VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1086,8 +1086,8 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type()); - return vloop_analyzer.cost_for_vector(vopc, vector_length(), element_basic_type()) + - vloop_analyzer.cost_for_vector(Op_VectorCastL2X, vector_length(), T_INT); + return vloop_analyzer.cost_for_vector_node(vopc, vector_length(), element_basic_type()) + + vloop_analyzer.cost_for_vector_node(Op_VectorCastL2X, vector_length(), T_INT); } VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1106,7 +1106,7 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply( } float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { - return 
vloop_analyzer.cost_for_vector(Op_VectorReinterpret, vector_length(), element_basic_type()); + return vloop_analyzer.cost_for_vector_node(Op_VectorReinterpret, vector_length(), element_basic_type()); } VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1123,7 +1123,7 @@ VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyStat float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { assert(scalar_opcode() == Op_Bool, ""); - return vloop_analyzer.cost_for_vector(Op_VectorMaskCmp, vector_length(), element_basic_type()); + return vloop_analyzer.cost_for_vector_node(Op_VectorMaskCmp, vector_length(), element_basic_type()); } VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1386,7 +1386,7 @@ float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) c BasicType bt = element_basic_type(); int vopc = vector_reduction_opcode(); bool requires_strict_order = ReductionNode::auto_vectorization_requires_strict_order(vopc); - return vloop_analyzer.cost_for_vector_reduction(vopc, vlen, bt, requires_strict_order); + return vloop_analyzer.cost_for_vector_reduction_node(vopc, vlen, bt, requires_strict_order); } VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1401,7 +1401,7 @@ VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { uint vlen = vector_length(); BasicType bt = element_basic_type(); - return vloop_analyzer.cost_for_vector(Op_LoadVector, vlen, bt); + return vloop_analyzer.cost_for_vector_node(Op_LoadVector, vlen, bt); } VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const { @@ -1436,7 +1436,7 @@ VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& appl float 
VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { uint vlen = vector_length(); BasicType bt = element_basic_type(); - return vloop_analyzer.cost_for_vector(Op_StoreVector, vlen, bt); + return vloop_analyzer.cost_for_vector_node(Op_StoreVector, vlen, bt); } VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const { diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index a887300806ce9..a30f0ff098faf 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -194,7 +194,7 @@ class VTransformGraph : public StackObj { void optimize(VTransform& vtransform); bool schedule(); bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const; - float cost() const; + float cost_for_vector_loop() const; void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const; private: @@ -259,7 +259,7 @@ class VTransform : public StackObj { void optimize() { return _graph.optimize(*this); } bool schedule() { return _graph.schedule(); } bool is_profitable() const; - float cost() const { return _graph.cost(); } + float cost_for_vector_loop() const { return _graph.cost_for_vector_loop(); } bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); } void apply();