diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index 3379041b2ccac..9809d096233a3 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -129,18 +129,24 @@ source %{ bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) { if (UseSVE == 0) { // These operations are not profitable to be vectorized on NEON, because no direct - // NEON instructions support them. But the match rule support for them is profitable for - // Vector API intrinsics. + // NEON instructions support them. They use multiple instructions which is more + // expensive in almost all cases where we would auto vectorize. + // But the match rule support for them is profitable for Vector API intrinsics. if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) || (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || + opcode == Op_MulVL || // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. // They are not suitable for auto-vectorization because the result would not conform // to the JLS, Section Evaluation Order. + // Note: we could implement sequential reductions for these reduction operators, but + // this will still almost never lead to speedups, because the sequential + // reductions are latency limited along the reduction chain, and not + // throughput limited. This is unlike unordered reductions (associative op) + // and element-wise ops which are usually throughput limited. opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || - opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || - opcode == Op_MulVL) { + opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { return false; } } diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 6d296cbdb3ac3..a9f42e1bc08c9 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -119,18 +119,24 @@ source %{ bool Matcher::match_rule_supported_auto_vectorization(int opcode, int vlen, BasicType bt) { if (UseSVE == 0) { // These operations are not profitable to be vectorized on NEON, because no direct - // NEON instructions support them. But the match rule support for them is profitable for - // Vector API intrinsics. + // NEON instructions support them. They use multiple instructions which is more + // expensive in almost all cases where we would auto vectorize. + // But the match rule support for them is profitable for Vector API intrinsics. if ((opcode == Op_VectorCastD2X && (bt == T_INT || bt == T_SHORT)) || (opcode == Op_VectorCastL2X && bt == T_FLOAT) || (opcode == Op_CountLeadingZerosV && bt == T_LONG) || (opcode == Op_CountTrailingZerosV && bt == T_LONG) || + opcode == Op_MulVL || // The implementations of Op_AddReductionVD/F in Neon are for the Vector API only. // They are not suitable for auto-vectorization because the result would not conform // to the JLS, Section Evaluation Order. + // Note: we could implement sequential reductions for these reduction operators, but + // this will still almost never lead to speedups, because the sequential + // reductions are latency limited along the reduction chain, and not + // throughput limited. This is unlike unordered reductions (associative op) + // and element-wise ops which are usually throughput limited. 
opcode == Op_AddReductionVD || opcode == Op_AddReductionVF || - opcode == Op_MulReductionVD || opcode == Op_MulReductionVF || - opcode == Op_MulVL) { + opcode == Op_MulReductionVD || opcode == Op_MulReductionVF) { return false; } } diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 6ab1ff37de9fd..dfac5240b504f 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -42,9 +42,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : ), _vpointer_for_main_loop_alignment(nullptr), _aw_for_main_loop_alignment(0), - _do_vector_loop(phase()->C->do_vector_loop()), // whether to do vectorization/simd style - _num_work_vecs(0), // amount of vector work we have - _num_reductions(0) // amount of reduction work we have + _do_vector_loop(phase()->C->do_vector_loop()) // whether to do vectorization/simd style { } @@ -1567,18 +1565,6 @@ void SuperWord::filter_packs_for_implemented() { // Remove packs that are not profitable. void SuperWord::filter_packs_for_profitable() { - // Count the number of reductions vs other vector ops, for the - // reduction profitability heuristic. - for (int i = 0; i < _packset.length(); i++) { - Node_List* pack = _packset.at(i); - Node* n = pack->at(0); - if (is_marked_reduction(n)) { - _num_reductions++; - } else { - _num_work_vecs++; - } - } - // Remove packs that are not profitable auto filter = [&](const Node_List* pack) { return profitable(pack); @@ -1595,31 +1581,7 @@ bool SuperWord::implemented(const Node_List* pack, const uint size) const { if (p0 != nullptr) { int opc = p0->Opcode(); if (is_marked_reduction(p0)) { - const Type *arith_type = p0->bottom_type(); - // This heuristic predicts that 2-element reductions for INT/LONG are not - // profitable. This heuristic was added in JDK-8078563. The argument - // was that reductions are not just a single instruction, but multiple, and - // hence it is not directly clear that they are profitable. If we only have - // two elements per vector, then the performance gains from non-reduction - // vectors are at most going from 2 scalar instructions to 1 vector instruction. - // But a 2-element reduction vector goes from 2 scalar instructions to - // 3 instructions (1 shuffle and two reduction ops). - // However, this optimization assumes that these reductions stay in the loop - // which may not be true any more in most cases after the introduction of: - // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop - // Hence, this heuristic has room for improvement. - bool is_two_element_int_or_long_reduction = (size == 2) && - (arith_type->basic_type() == T_INT || - arith_type->basic_type() == T_LONG); - if (is_two_element_int_or_long_reduction && AutoVectorizationOverrideProfitability != 2) { -#ifndef PRODUCT - if (is_trace_superword_rejections()) { - tty->print_cr("\nPerformance heuristic: 2-element INT/LONG reduction not profitable."); - tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); - } -#endif - return false; - } + const Type* arith_type = p0->bottom_type(); retValue = ReductionNode::implemented(opc, size, arith_type->basic_type()); } else if (VectorNode::is_convert_opcode(opc)) { retValue = VectorCastNode::implemented(opc, size, velt_basic_type(p0->in(1)), velt_basic_type(p0)); @@ -1772,26 +1734,6 @@ bool SuperWord::profitable(const Node_List* p) const { // The second input has to be the vector we wanted to reduce, // but it was not packed. 
return false; - } else if (_num_work_vecs == _num_reductions && AutoVectorizationOverrideProfitability != 2) { - // This heuristic predicts that the reduction is not profitable. - // Reduction vectors can be expensive, because they require multiple - // operations to fold all the lanes together. Hence, vectorizing the - // reduction is not profitable on its own. Hence, we need a lot of - // other "work vectors" that deliver performance improvements to - // balance out the performance loss due to reductions. - // This heuristic is a bit simplistic, and assumes that the reduction - // vector stays in the loop. But in some cases, we can move the - // reduction out of the loop, replacing it with a single vector op. - // See: VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop - // Hence, this heuristic has room for improvement. -#ifndef PRODUCT - if (is_trace_superword_rejections()) { - tty->print_cr("\nPerformance heuristic: not enough vectors in the loop to make"); - tty->print_cr(" reduction profitable."); - tty->print_cr(" Can override with AutoVectorizationOverrideProfitability=2"); - } -#endif - return false; } else if (second_pk->size() != p->size()) { return false; } @@ -1950,19 +1892,53 @@ bool SuperWord::do_vtransform() const { vtransform.optimize(); if (!vtransform.schedule()) { return false; } - if (vtransform.has_store_to_load_forwarding_failure()) { return false; } + + if (!vtransform.is_profitable()) { return false; } + + vtransform.apply(); + return true; +} + +// Check Cost-Model, and other heuristics. +// Can be overridden with AutoVectorizationOverrideProfitability. +bool VTransform::is_profitable() const { + assert(_graph.is_scheduled(), "must already be scheduled"); if (AutoVectorizationOverrideProfitability == 0) { #ifndef PRODUCT - if (is_trace_superword_any()) { + if (_trace._info) { tty->print_cr("\nForced bailout of vectorization (AutoVectorizationOverrideProfitability=0)."); } #endif return false; } - vtransform.apply(); - return true; + if (AutoVectorizationOverrideProfitability == 2) { +#ifndef PRODUCT + if (_trace._info) { + tty->print_cr("\nForced vectorization, ignoring profitability (AutoVectorizationOverrideProfitability=2)."); + } +#endif + return true; + } + + // Note: currently we only do throughput-based cost-modeling. In the future, we could + // also implement latency-based cost-modeling and take store-to-load-forwarding + // failures into account as the latency between the load and store. This would + // allow a more precise tradeoff between the forwarding failure penalty versus + // the vectorization gains. + if (has_store_to_load_forwarding_failure()) { return false; } + + // Cost-model + float scalar_cost = _vloop_analyzer.cost_for_scalar_loop(); + float vector_cost = cost_for_vector_loop(); +#ifndef PRODUCT + if (_trace._info) { + tty->print_cr("\nVTransform: scalar_cost = %.2f vs vector_cost = %.2f", + scalar_cost, vector_cost); + } +#endif + return vector_cost < scalar_cost; } // Apply the vectorization, i.e. we irreversibly edit the C2 graph. 
At this point, all diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 118e0aa042c79..9654465220b9c 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -549,8 +549,6 @@ class SuperWord : public ResourceObj { private: bool _do_vector_loop; // whether to do vectorization/simd style - int _num_work_vecs; // Number of non memory vector operations - int _num_reductions; // Number of reduction expressions applied // Accessors Arena* arena() { return &_arena; } diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index d996173aeb43b..4f67aff9b0706 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -38,7 +38,7 @@ flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ flags(BODY, "Trace VLoopBody") \ flags(TYPES, "Trace VLoopTypes") \ - flags(POINTERS, "Trace VLoopPointers") \ + flags(POINTERS, "Trace VLoopVPointers") \ flags(DEPENDENCY_GRAPH, "Trace VLoopDependencyGraph") \ flags(SW_ADJACENT_MEMOPS, "Trace SuperWord::find_adjacent_memop_pairs") \ flags(SW_REJECTIONS, "Trace SuperWord rejections (non vectorizations)") \ @@ -47,6 +47,8 @@ flags(SW_VERBOSE, "Trace SuperWord verbose (all SW tags enabled)") \ flags(VTRANSFORM, "Trace VTransform Graph") \ flags(OPTIMIZATION, "Trace VTransform::optimize") \ + flags(COST, "Trace cost of VLoop (scalar) and VTransform (vector)") \ + flags(COST_VERBOSE, "Trace like COST, but more verbose") \ flags(ALIGN_VECTOR, "Trace AlignVector") \ flags(SPECULATIVE_ALIASING_ANALYSIS, "Trace Speculative Aliasing Analysis") \ flags(SPECULATIVE_RUNTIME_CHECKS, "Trace VTransform::apply_speculative_runtime_checks") \ diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 5c4e15fdbb916..98f3d79c9f5ce 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -287,7 +287,7 @@ void VLoopVPointers::compute_and_cache_vpointers() { int pointers_idx = 0; _body.for_each_mem([&] (MemNode* const mem, int bb_idx) { // Placement new: construct directly into the array. - ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop); + ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, _pointer_expression_nodes); _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); pointers_idx++; }); @@ -541,6 +541,108 @@ void VLoopDependencyGraph::PredsIterator::next() { } } +// Cost-model heuristic for nodes that do not contribute to computational +// cost inside the loop. +bool VLoopAnalyzer::has_zero_cost(Node* n) const { + // Outside body? + if (!_vloop.in_bb(n)) { return true; } + + // Internal nodes of pointer expressions are most likely folded into + // the load / store and have no additional cost. + if (vpointers().is_in_pointer_expression(n)) { return true; } + + // Not all AddP nodes can be detected in VPointer parsing, so + // we filter them out here. + // We don't want to explicitly model the cost of control flow, + // since we have the same CFG structure before and after + // vectorization: A loop head, a loop exit, with a backedge. + if (n->is_AddP() || // Pointer expression + n->is_CFG() || // CFG + n->is_Phi() || // CFG + n->is_Cmp() || // CFG + n->is_Bool()) { // CFG + return true; + } + + // All other nodes have a non-zero cost. + return false; +} + +// Compute the cost over all operations in the (scalar) loop. 
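+// Nodes for which has_zero_cost(n) holds (nodes outside the body, pointer expression internals, AddP, CFG, Phi, Cmp, Bool) are skipped; every remaining node contributes the (currently unit) cost returned by cost_for_scalar_node.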
+float VLoopAnalyzer::cost_for_scalar_loop() const { +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr("\nVLoopAnalyzer::cost_for_scalar_loop:"); + } +#endif + + float sum = 0; + for (int j = 0; j < body().body().length(); j++) { + Node* n = body().body().at(j); + if (!has_zero_cost(n)) { + float c = cost_for_scalar_node(n->Opcode()); + sum += c; +#ifndef PRODUCT + if (_vloop.is_trace_cost_verbose()) { + tty->print_cr(" -> cost = %.2f for %d %s", c, n->_idx, n->Name()); + } +#endif + } + } + +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" total_cost = %.2f", sum); + } +#endif + return sum; +} + +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. +float VLoopAnalyzer::cost_for_scalar_node(int opcode) const { + float c = 1; +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s", c, NodeClassNames[opcode]); + } +#endif + return c; +} + +// For now, we use unit cost. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. +float VLoopAnalyzer::cost_for_vector_node(int opcode, int vlen, BasicType bt) const { + float c = 1; +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s", + c, NodeClassNames[opcode], vlen, type2name(bt)); + } +#endif + return c; +} + +// For now, we use unit cost, i.e. we count the number of backend instructions +// that the vtnode will use. We might refine that in the future. +// If needed, we could also use platform specific costs, if the +// default here is not accurate enough. +float VLoopAnalyzer::cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const { + // Each reduction is composed of multiple instructions, each estimated with a unit cost. + // Linear: shuffle and reduce Recursive: shuffle and reduce + float c = requires_strict_order ? 2 * vlen : 2 * exact_log2(vlen); +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" cost = %.2f opc=%s vlen=%d bt=%s requires_strict_order=%s", + c, NodeClassNames[opcode], vlen, type2name(bt), + requires_strict_order ? "true" : "false"); + } +#endif + return c; +} + // Computing aliasing runtime check using init and last of main-loop // ----------------------------------------------------------------- // diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index b1be52d531a51..f7099b5b7c0a4 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -209,6 +209,14 @@ class VLoop : public StackObj { return _vtrace.is_trace(TraceAutoVectorizationTag::OPTIMIZATION); } + bool is_trace_cost() const { + return _vtrace.is_trace(TraceAutoVectorizationTag::COST); + } + + bool is_trace_cost_verbose() const { + return _vtrace.is_trace(TraceAutoVectorizationTag::COST_VERBOSE); + } + bool is_trace_speculative_runtime_checks() const { return _vtrace.is_trace(TraceAutoVectorizationTag::SPECULATIVE_RUNTIME_CHECKS); } @@ -584,6 +592,32 @@ class VLoopTypes : public StackObj { const Type* container_type(Node* n) const; }; +// Mark all nodes from the loop that are part of any VPointer expression. 
+class PointerExpressionNodes : public MemPointerParserCallback { +private: + const VLoop& _vloop; + const VLoopBody& _body; + VectorSet _in_pointer_expression; + +public: + PointerExpressionNodes(Arena* arena, + const VLoop& vloop, + const VLoopBody& body) : + _vloop(vloop), + _body(body), + _in_pointer_expression(arena) {} + + virtual void callback(Node* n) override { + if (!_vloop.in_bb(n)) { return; } + _in_pointer_expression.set(_body.bb_idx(n)); + } + + bool contains(const Node* n) const { + if (!_vloop.in_bb(n)) { return false; } + return _in_pointer_expression.test(_body.bb_idx(n)); + } +}; + // Submodule of VLoopAnalyzer. // We compute and cache the VPointer for every load and store. class VLoopVPointers : public StackObj { @@ -599,6 +633,9 @@ class VLoopVPointers : public StackObj { // Map bb_idx -> index in _vpointers. -1 if not mapped. GrowableArray<int> _bb_idx_to_vpointer; + // Mark all nodes that are part of any pointer expression. + PointerExpressionNodes _pointer_expression_nodes; + public: VLoopVPointers(Arena* arena, const VLoop& vloop, @@ -610,13 +647,18 @@ class VLoopVPointers : public StackObj { _bb_idx_to_vpointer(arena, vloop.estimated_body_length(), vloop.estimated_body_length(), - -1) {} + -1), + _pointer_expression_nodes(arena, _vloop, _body) {} NONCOPYABLE(VLoopVPointers); void compute_vpointers(); const VPointer& vpointer(const MemNode* mem) const; NOT_PRODUCT( void print() const; ) + bool is_in_pointer_expression(const Node* n) const { + return _pointer_expression_nodes.contains(n); + } + private: void count_vpointers(); void allocate_vpointers_array(); @@ -810,6 +852,15 @@ class VLoopAnalyzer : StackObj { const VLoopVPointers& vpointers() const { return _vpointers; } const VLoopDependencyGraph& dependency_graph() const { return _dependency_graph; } + // Compute the cost of the (scalar) body. + float cost_for_scalar_loop() const; + bool has_zero_cost(Node* n) const; + + // Cost-modeling with tracing. + float cost_for_scalar_node(int opcode) const; + float cost_for_vector_node(int opcode, int vlen, BasicType bt) const; + float cost_for_vector_reduction_node(int opcode, int vlen, BasicType bt, bool requires_strict_order) const; + private: bool setup_submodules(); VStatus setup_submodules_helper(); diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 46e8f43cb657d..9fd6ad1089c55 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -186,6 +186,99 @@ int VTransformGraph::count_alive_vtnodes() const { return count; } +// Find all nodes that are in the loop, in a 2-phase process: +// - First, find all nodes that are not before the loop: +// - loop-phis +// - loads and stores that are in the loop +// - and all their transitive uses. +// - Second, we find all nodes that are not after the loop: +// - backedges +// - loads and stores that are in the loop +// - and all their transitive defs. +// +// in_loop: vtn->_idx -> bool +void VTransformGraph::mark_vtnodes_in_loop(VectorSet& in_loop) const { + assert(is_scheduled(), "must already be scheduled"); + + // Phase 1: find all nodes that are not before the loop. + VectorSet is_not_before_loop; + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + // Is vtn a loop-phi? + if (vtn->isa_LoopPhi() != nullptr || + vtn->is_load_or_store_in_loop()) { + is_not_before_loop.set(vtn->_idx); + continue; + } + // Or one of its transitive uses?
+ for (uint j = 0; j < vtn->req(); j++) { + VTransformNode* def = vtn->in_req(j); + if (def != nullptr && is_not_before_loop.test(def->_idx)) { + is_not_before_loop.set(vtn->_idx); + break; + } + } + } + + // Phase 2: find all nodes that are not after the loop. + for (int i = _schedule.length()-1; i >= 0; i--) { + VTransformNode* vtn = _schedule.at(i); + if (!is_not_before_loop.test(vtn->_idx)) { continue; } + // Is load or store? + if (vtn->is_load_or_store_in_loop()) { + in_loop.set(vtn->_idx); + continue; + } + for (uint i = 0; i < vtn->out_strong_edges(); i++) { + VTransformNode* use = vtn->out_strong_edge(i); + // Or is vtn a backedge or one of its transitive defs? + if (in_loop.test(use->_idx) || + use->isa_LoopPhi() != nullptr) { + in_loop.set(vtn->_idx); + break; + } + } + } +} + +float VTransformGraph::cost_for_vector_loop() const { + assert(is_scheduled(), "must already be scheduled"); +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr("\nVTransformGraph::cost_for_vector_loop:"); + } +#endif + + // We only want to count the cost of nodes that are in the loop. + // This is especially important for cases where we were able to move + // some nodes outside the loop during VTransform::optimize, e.g.: + // VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop + ResourceMark rm; + VectorSet in_loop; // vtn->_idx -> bool + mark_vtnodes_in_loop(in_loop); + + float sum = 0; + for (int i = 0; i < _schedule.length(); i++) { + VTransformNode* vtn = _schedule.at(i); + if (!in_loop.test(vtn->_idx)) { continue; } + float c = vtn->cost(_vloop_analyzer); + sum += c; +#ifndef PRODUCT + if (c != 0 && _vloop.is_trace_cost_verbose()) { + tty->print(" -> cost = %.2f for ", c); + vtn->print(); + } +#endif + } + +#ifndef PRODUCT + if (_vloop.is_trace_cost()) { + tty->print_cr(" total_cost = %.2f", sum); + } +#endif + return sum; +} + #ifndef PRODUCT void VTransformGraph::trace_schedule_cycle(const GrowableArray& stack, const VectorSet& pre_visited, @@ -831,6 +924,12 @@ void VTransformNode::apply_vtn_inputs_to_node(Node* n, VTransformApplyState& app } } +float VTransformMemopScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + // This is an identity transform, but loads and stores must be counted. + assert(!vloop_analyzer.has_zero_cost(_node), "memop nodes must be counted"); + return vloop_analyzer.cost_for_scalar_node(_node->Opcode()); +} + VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& apply_state) const { apply_vtn_inputs_to_node(_node, apply_state); // The memory state has to be applied separately: the vtn does not hold it. This allows reordering. @@ -843,6 +942,16 @@ VTransformApplyResult VTransformMemopScalarNode::apply(VTransformApplyState& app return VTransformApplyResult::make_scalar(_node); } +float VTransformDataScalarNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + // Since this is an identity transform, we may have nodes that also + // VLoopAnalyzer::cost does not count for the scalar loop. 
+ if (vloop_analyzer.has_zero_cost(_node)) { + return 0; + } else { + return vloop_analyzer.cost_for_scalar_node(_node->Opcode()); + } +} + VTransformApplyResult VTransformDataScalarNode::apply(VTransformApplyState& apply_state) const { apply_vtn_inputs_to_node(_node, apply_state); return VTransformApplyResult::make_scalar(_node); @@ -895,6 +1004,10 @@ VTransformApplyResult VTransformOuterNode::apply(VTransformApplyState& apply_sta return VTransformApplyResult::make_scalar(_node); } +float VTransformReplicateNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector_node(Op_Replicate, _vlen, _element_type); +} + VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply_state) const { Node* val = apply_state.transformed_node(in_req(1)); VectorNode* vn = VectorNode::scalar2vector(val, _vlen, _element_type); @@ -902,6 +1015,10 @@ VTransformApplyResult VTransformReplicateNode::apply(VTransformApplyState& apply return VTransformApplyResult::make_vector(vn); } +float VTransformConvI2LNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_scalar_node(Op_ConvI2L); +} + VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_state) const { Node* val = apply_state.transformed_node(in_req(1)); Node* n = new ConvI2LNode(val); @@ -909,6 +1026,12 @@ VTransformApplyResult VTransformConvI2LNode::apply(VTransformApplyState& apply_s return VTransformApplyResult::make_scalar(n); } +float VTransformShiftCountNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + int shift_count_opc = VectorNode::shift_count_opcode(_shift_opcode); + return vloop_analyzer.cost_for_scalar_node(Op_AndI) + + vloop_analyzer.cost_for_vector_node(shift_count_opc, _vlen, _element_bt); +} + VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& apply_state) const { PhaseIdealLoop* phase = apply_state.phase(); Node* shift_count_in = apply_state.transformed_node(in_req(1)); @@ -924,6 +1047,9 @@ VTransformApplyResult VTransformShiftCountNode::apply(VTransformApplyState& appl return VTransformApplyResult::make_vector(vn); } +float VTransformPopulateIndexNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector_node(Op_PopulateIndex, _vlen, _element_bt); +} VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& apply_state) const { PhaseIdealLoop* phase = apply_state.phase(); @@ -936,6 +1062,10 @@ VTransformApplyResult VTransformPopulateIndexNode::apply(VTransformApplyState& a return VTransformApplyResult::make_vector(vn); } +float VTransformElementWiseVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector_node(_vector_opcode, vector_length(), element_basic_type()); +} + VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyState& apply_state) const { assert(2 <= req() && req() <= 4, "Must have 1-3 inputs"); const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length()); @@ -954,6 +1084,12 @@ VTransformApplyResult VTransformElementWiseVectorNode::apply(VTransformApplyStat return VTransformApplyResult::make_vector(vn); } +float VTransformElementWiseLongOpWithCastToIntVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + int vopc = VectorNode::opcode(scalar_opcode(), element_basic_type()); + return vloop_analyzer.cost_for_vector_node(vopc, vector_length(), element_basic_type()) + + vloop_analyzer.cost_for_vector_node(Op_VectorCastL2X, vector_length(), 
T_INT); +} + VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply(VTransformApplyState& apply_state) const { uint vlen = vector_length(); int sopc = scalar_opcode(); @@ -969,6 +1105,10 @@ VTransformApplyResult VTransformElementWiseLongOpWithCastToIntVectorNode::apply( return VTransformApplyResult::make_vector(vn); } +float VTransformReinterpretVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + return vloop_analyzer.cost_for_vector_node(Op_VectorReinterpret, vector_length(), element_basic_type()); +} + VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyState& apply_state) const { const TypeVect* dst_vt = TypeVect::make(element_basic_type(), vector_length()); const TypeVect* src_vt = TypeVect::make(_src_bt, vector_length()); @@ -981,6 +1121,11 @@ VTransformApplyResult VTransformReinterpretVectorNode::apply(VTransformApplyStat return VTransformApplyResult::make_vector(vn); } +float VTransformBoolVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + assert(scalar_opcode() == Op_Bool, ""); + return vloop_analyzer.cost_for_vector_node(Op_VectorMaskCmp, vector_length(), element_basic_type()); +} + VTransformApplyResult VTransformBoolVectorNode::apply(VTransformApplyState& apply_state) const { const TypeVect* vt = TypeVect::make(element_basic_type(), vector_length()); assert(scalar_opcode() == Op_Bool, ""); @@ -1101,10 +1246,10 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou const BasicType bt = element_basic_type(); const int ropc = vector_reduction_opcode(); const int vopc = VectorNode::opcode(sopc, bt); - if (!Matcher::match_rule_supported_vector(vopc, vlen, bt)) { - DEBUG_ONLY( this->print(); ) - assert(false, "do not have normal vector op for this reduction"); - return false; // not implemented + if (!Matcher::match_rule_supported_auto_vectorization(vopc, vlen, bt)) { + // The element-wise vector operation needed for the vector accumulator + // is not implemented / supported. 
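+ // (On aarch64 with only NEON, for example, Op_MulVL has a match rule but is rejected for auto-vectorization; see the aarch64_vector.ad change above.)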
+ return false; } // Traverse up the chain of non strict order reductions, checking that it loops @@ -1236,6 +1381,14 @@ bool VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_ou return true; // success } +float VTransformReductionVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + int vopc = vector_reduction_opcode(); + bool requires_strict_order = ReductionNode::auto_vectorization_requires_strict_order(vopc); + return vloop_analyzer.cost_for_vector_reduction_node(vopc, vlen, bt, requires_strict_order); +} + VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& apply_state) const { Node* init = apply_state.transformed_node(in_req(1)); Node* vec = apply_state.transformed_node(in_req(2)); @@ -1245,6 +1398,12 @@ VTransformApplyResult VTransformReductionVectorNode::apply(VTransformApplyState& return VTransformApplyResult::make_vector(vn, vn->vect_type()); } +float VTransformLoadVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + return vloop_analyzer.cost_for_vector_node(Op_LoadVector, vlen, bt); +} + VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& apply_state) const { int sopc = scalar_opcode(); uint vlen = vector_length(); @@ -1274,6 +1433,12 @@ VTransformApplyResult VTransformLoadVectorNode::apply(VTransformApplyState& appl return VTransformApplyResult::make_vector(vn, vn->vect_type()); } +float VTransformStoreVectorNode::cost(const VLoopAnalyzer& vloop_analyzer) const { + uint vlen = vector_length(); + BasicType bt = element_basic_type(); + return vloop_analyzer.cost_for_vector_node(Op_StoreVector, vlen, bt); +} + VTransformApplyResult VTransformStoreVectorNode::apply(VTransformApplyState& apply_state) const { int sopc = scalar_opcode(); uint vlen = vector_length(); diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 7ad7b432e9b43..a30f0ff098faf 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -51,6 +51,10 @@ // - Compute linearization of the VTransformGraph, into an order that respects // all edges in the graph (bailout if cycle detected). // +// - Cost-Model: +// - We use a cost-model as a heuristic to determine if vectorization is profitable. +// Compute the cost of the loop with and without vectorization. +// // - Apply: // - Changes to the C2 IR are only made once the "apply" method is called. // - Align the main loop, by adjusting pre loop limit. 
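Illustrative only, not part of the patch: a minimal standalone C++ sketch of the unit-cost numbers this cost model assigns to reductions, mirroring VLoopAnalyzer::cost_for_vector_reduction_node from vectorization.cpp above (reduction_cost and the printed table are made up for this sketch; every reduction step is modeled as a shuffle plus a reduce, each counted as one instruction).

#include <cmath>
#include <cstdio>

// Mirrors the heuristic above: strict-order (linear) reductions cost 2 * vlen,
// non-strict-order (recursive) reductions cost 2 * log2(vlen).
static float reduction_cost(int vlen, bool requires_strict_order) {
  return requires_strict_order ? 2.0f * vlen
                               : 2.0f * std::log2(static_cast<float>(vlen));
}

int main() {
  const int vlens[] = {2, 4, 8, 16};
  for (int vlen : vlens) {
    std::printf("vlen = %2d: strict-order = %4.1f, non-strict-order = %4.1f\n",
                vlen, reduction_cost(vlen, true), reduction_cost(vlen, false));
  }
  return 0;
}

At vlen = 8 this gives 16 vs 6, while an ordinary element-wise vector op counts 1. When a non-strict-order reduction is additionally moved out of the loop by VTransformReductionVectorNode::optimize_move_non_strict_order_reductions_out_of_loop, only a unit-cost element-wise accumulator remains inside the loop, which is what makes cases like the int sum in TestIntVect::test_sum profitable now.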
@@ -190,6 +194,7 @@ class VTransformGraph : public StackObj { void optimize(VTransform& vtransform); bool schedule(); bool has_store_to_load_forwarding_failure(const VLoopAnalyzer& vloop_analyzer) const; + float cost_for_vector_loop() const; void apply_vectorization_for_each_vtnode(uint& max_vector_length, uint& max_vector_width) const; private: @@ -200,6 +205,7 @@ class VTransformGraph : public StackObj { void collect_nodes_without_strong_in_edges(GrowableArray& stack) const; int count_alive_vtnodes() const; + void mark_vtnodes_in_loop(VectorSet& in_loop) const; #ifndef PRODUCT void print_vtnodes() const; @@ -252,6 +258,8 @@ class VTransform : public StackObj { void optimize() { return _graph.optimize(*this); } bool schedule() { return _graph.schedule(); } + bool is_profitable() const; + float cost_for_vector_loop() const { return _graph.cost_for_vector_loop(); } bool has_store_to_load_forwarding_failure() const { return _graph.has_store_to_load_forwarding_failure(_vloop_analyzer); } void apply(); @@ -549,6 +557,8 @@ class VTransformNode : public ArenaObj { virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) { return false; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const = 0; + virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const = 0; virtual void apply_backedge(VTransformApplyState& apply_state) const {}; void apply_vtn_inputs_to_node(Node* n, VTransformApplyState& apply_state) const; @@ -579,6 +589,7 @@ class VTransformMemopScalarNode : public VTransformNode { virtual bool is_load_or_store_in_loop() const override { return true; } virtual const VPointer& vpointer() const override { return _vpointer; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "MemopScalar"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -595,6 +606,7 @@ class VTransformDataScalarNode : public VTransformNode { assert(!_node->is_Mem() && !_node->is_Phi() && !_node->is_CFG(), "must be data node: %s", _node->Name()); } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "DataScalar"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -612,6 +624,7 @@ class VTransformLoopPhiNode : public VTransformNode { } virtual VTransformLoopPhiNode* isa_LoopPhi() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; virtual void apply_backedge(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoopPhi"; };) @@ -629,6 +642,7 @@ class VTransformCFGNode : public VTransformNode { assert(_node->is_CFG(), "must be CFG node: %s", _node->Name()); } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "CFG"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -655,6 +669,7 @@ class VTransformOuterNode : public VTransformNode { VTransformNode(vtransform, n->req()), _node(n) {} virtual VTransformOuterNode* 
isa_Outer() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { ShouldNotReachHere(); } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "Outer"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -668,6 +683,7 @@ class VTransformReplicateNode : public VTransformNode { public: VTransformReplicateNode(VTransform& vtransform, int vlen, BasicType element_type) : VTransformNode(vtransform, 2), _vlen(vlen), _element_type(element_type) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "Replicate"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -677,6 +693,7 @@ class VTransformReplicateNode : public VTransformNode { class VTransformConvI2LNode : public VTransformNode { public: VTransformConvI2LNode(VTransform& vtransform) : VTransformNode(vtransform, 2) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ConvI2L"; };) }; @@ -691,6 +708,7 @@ class VTransformShiftCountNode : public VTransformNode { public: VTransformShiftCountNode(VTransform& vtransform, int vlen, BasicType element_bt, juint mask, int shift_opcode) : VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt), _mask(mask), _shift_opcode(shift_opcode) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ShiftCount"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -704,6 +722,7 @@ class VTransformPopulateIndexNode : public VTransformNode { public: VTransformPopulateIndexNode(VTransform& vtransform, int vlen, const BasicType element_bt) : VTransformNode(vtransform, 2), _vlen(vlen), _element_bt(element_bt) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "PopulateIndex"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -769,6 +788,7 @@ class VTransformElementWiseVectorNode : public VTransformVectorNode { VTransformElementWiseVectorNode(VTransform& vtransform, uint req, const VTransformVectorNodeProperties properties, const int vector_opcode) : VTransformVectorNode(vtransform, req, properties), _vector_opcode(vector_opcode) {} virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -781,6 +801,7 @@ class VTransformElementWiseLongOpWithCastToIntVectorNode : public VTransformVect public: VTransformElementWiseLongOpWithCastToIntVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) : VTransformVectorNode(vtransform, 2, properties) {} + virtual float cost(const VLoopAnalyzer& 
vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ElementWiseLongOpWithCastToIntVector"; };) }; @@ -791,6 +812,7 @@ class VTransformReinterpretVectorNode : public VTransformVectorNode { public: VTransformReinterpretVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties, const BasicType src_bt) : VTransformVectorNode(vtransform, 2, properties), _src_bt(src_bt) {} + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ReinterpretVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -811,6 +833,7 @@ class VTransformCmpVectorNode : public VTransformVectorNode { VTransformCmpVectorNode(VTransform& vtransform, const VTransformVectorNodeProperties properties) : VTransformVectorNode(vtransform, 3, properties) {} virtual VTransformCmpVectorNode* isa_CmpVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override { return 0; } virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override { return VTransformApplyResult::make_empty(); } NOT_PRODUCT(virtual const char* name() const override { return "CmpVector"; };) }; @@ -823,6 +846,7 @@ class VTransformBoolVectorNode : public VTransformVectorNode { VTransformVectorNode(vtransform, 2, properties), _test(test) {} VTransformBoolTest test() const { return _test; } virtual VTransformBoolVectorNode* isa_BoolVector() override { return this; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "BoolVector"; };) NOT_PRODUCT(virtual void print_spec() const override;) @@ -835,6 +859,7 @@ class VTransformReductionVectorNode : public VTransformVectorNode { VTransformVectorNode(vtransform, 3, properties) {} virtual VTransformReductionVectorNode* isa_ReductionVector() override { return this; } virtual bool optimize(const VLoopAnalyzer& vloop_analyzer, VTransform& vtransform) override; + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };) @@ -877,6 +902,7 @@ class VTransformLoadVectorNode : public VTransformMemVectorNode { LoadNode::ControlDependency control_dependency() const; virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; } virtual bool is_load_in_loop() const override { return true; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };) }; @@ -888,6 +914,7 @@ class VTransformStoreVectorNode : public VTransformMemVectorNode { VTransformMemVectorNode(vtransform, 4, properties, vpointer, adr_type) {} virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; } virtual bool is_load_in_loop() const override { return false; } + virtual float cost(const VLoopAnalyzer& vloop_analyzer) const override; virtual VTransformApplyResult apply(VTransformApplyState& apply_state) 
const override; NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };) }; diff --git a/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java b/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java index 457e33667b2d1..76c33ec1b0772 100644 --- a/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java +++ b/test/hotspot/jtreg/compiler/c2/cr7200264/TestIntVect.java @@ -410,12 +410,12 @@ public void run() { } - // Not vectorized: simple addition not profitable, see JDK-8307516. NOTE: - // This check does not document the _desired_ behavior of the system but - // the current behavior (no vectorization) @Test - @IR(counts = { IRNode.LOAD_VECTOR_I, "= 0", - IRNode.STORE_VECTOR, "= 0" }) + @IR(counts = { IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0" }) + // The reduction is moved outside the loop, and we use an + // element-wise accumulator inside the loop. int test_sum(int[] a1) { int sum = 0; for (int i = 0; i < a1.length; i+=1) { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java index 10ad19d03a74d..89b46871cb56a 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAutoVectorizationOverrideProfitability.java @@ -115,17 +115,18 @@ public static void checkSimpleFloatCopy() { @Test @Warmup(10) @IR(applyIfCPUFeatureOr = {"avx", "true"}, - applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}, counts = {IRNode.ADD_REDUCTION_VI, "> 0", IRNode.ADD_VI, "> 0"}) @IR(applyIfCPUFeatureOr = {"avx", "true"}, - applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}, counts = {IRNode.ADD_REDUCTION_VI, "= 0", IRNode.ADD_VI, "= 0"}) - // Current heuristics say that this simple int reduction is not profitable. - // But it would actually be profitable, since we are able to move the - // reduction out of the loop (we can reorder the reduction). When moving - // the reduction out of the loop, we instead accumulate with a simple - // ADD_VI inside the loop. - // See: JDK-8307516 JDK-8345044 + // We are able to vectorize the reduction. But on its own, that would + // not reduce the cost sufficiently in all cases, because vectorized + // reduction nodes are expensive. But since integer addition is associative, + // we can move the reduction vector out of the loop. Instead, we accumulate + // with a simple ADD_VI inside the loop, which is very cheap. After the + // loop, we only need to use the vectorized reduction once, to collapse + // the partial sums contained in the lanes. private static int simpleIntReduction() { int sum = 0; for (int i = 0; i < aI.length; i++) { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java new file mode 100644 index 0000000000000..1cd5cfa1e750c --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java @@ -0,0 +1,2452 @@ +/* + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test id=no-vectorization + * @bug 8340093 + * @summary Test vectorization of reduction loops. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions P0 + */ + +/* + * @test id=vanilla + * @bug 8340093 + * @summary Test vectorization of reduction loops. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions P1 + */ + +/* + * @test id=force-vectorization + * @bug 8340093 + * @summary Test vectorization of reduction loops. + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestReductions P2 + */ + +package compiler.loopopts.superword; + +import java.util.Map; +import java.util.HashMap; + +import compiler.lib.ir_framework.*; +import compiler.lib.verify.*; +import static compiler.lib.generators.Generators.G; +import compiler.lib.generators.Generator; + +/** + * Note: there is a corresponding JMH benchmark: + * test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java + */ +public class TestReductions { + private static int SIZE = 1024*8; + private static final Generator GEN_I = G.ints(); + private static final Generator GEN_L = G.longs(); + private static final Generator GEN_F = G.floats(); + private static final Generator GEN_D = G.doubles(); + + private static byte[] in1B = fillRandom(new byte[SIZE]); + private static byte[] in2B = fillRandom(new byte[SIZE]); + private static byte[] in3B = fillRandom(new byte[SIZE]); + private static char[] in1C = fillRandom(new char[SIZE]); + private static char[] in2C = fillRandom(new char[SIZE]); + private static char[] in3C = fillRandom(new char[SIZE]); + private static short[] in1S = fillRandom(new short[SIZE]); + private static short[] in2S = fillRandom(new short[SIZE]); + private static short[] in3S = fillRandom(new short[SIZE]); + + private static int[] in1I = fillRandom(new int[SIZE]); + private static int[] in2I = fillRandom(new int[SIZE]); + private static int[] in3I = fillRandom(new int[SIZE]); + private static long[] in1L = fillRandom(new long[SIZE]); + private static long[] in2L = fillRandom(new long[SIZE]); + private static long[] in3L = fillRandom(new long[SIZE]); + + private static float[] in1F = fillRandom(new float[SIZE]); + private static float[] in2F = fillRandom(new float[SIZE]); + private static float[] in3F = fillRandom(new float[SIZE]); + private static double[] in1D = fillRandom(new double[SIZE]); + private static double[] in2D = fillRandom(new double[SIZE]); + private static double[] in3D = fillRandom(new double[SIZE]); + + interface TestFunction { + Object run(); + } + + // Map of test names to tests. 
+ Map tests = new HashMap(); + + // Map of gold, the results from the first run (before compilation), one per tests entry. + Map golds = new HashMap(); + + public static void main(String[] args) { + TestFramework framework = new TestFramework(TestReductions.class); + switch (args[0]) { + case "P0" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=0"); } + case "P1" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=1"); } + // Note: increasing the node count limit also helps in some cases. + case "P2" -> { framework.addFlags("-XX:+UnlockDiagnosticVMOptions", "-XX:AutoVectorizationOverrideProfitability=2", "-XX:LoopUnrollLimit=1000"); } + default -> { throw new RuntimeException("Test argument not recognized: " + args[0]); } + }; + framework.start(); + } + + public TestReductions() { + // Add all tests to list + tests.put("byteAndSimple", TestReductions::byteAndSimple); + tests.put("byteOrSimple", TestReductions::byteOrSimple); + tests.put("byteXorSimple", TestReductions::byteXorSimple); + tests.put("byteAddSimple", TestReductions::byteAddSimple); + tests.put("byteMulSimple", TestReductions::byteMulSimple); + tests.put("byteMinSimple", TestReductions::byteMinSimple); + tests.put("byteMaxSimple", TestReductions::byteMaxSimple); + tests.put("byteAndDotProduct", TestReductions::byteAndDotProduct); + tests.put("byteOrDotProduct", TestReductions::byteOrDotProduct); + tests.put("byteXorDotProduct", TestReductions::byteXorDotProduct); + tests.put("byteAddDotProduct", TestReductions::byteAddDotProduct); + tests.put("byteMulDotProduct", TestReductions::byteMulDotProduct); + tests.put("byteMinDotProduct", TestReductions::byteMinDotProduct); + tests.put("byteMaxDotProduct", TestReductions::byteMaxDotProduct); + tests.put("byteAndBig", TestReductions::byteAndBig); + tests.put("byteOrBig", TestReductions::byteOrBig); + tests.put("byteXorBig", TestReductions::byteXorBig); + tests.put("byteAddBig", TestReductions::byteAddBig); + tests.put("byteMulBig", TestReductions::byteMulBig); + tests.put("byteMinBig", TestReductions::byteMinBig); + tests.put("byteMaxBig", TestReductions::byteMaxBig); + + tests.put("charAndSimple", TestReductions::charAndSimple); + tests.put("charOrSimple", TestReductions::charOrSimple); + tests.put("charXorSimple", TestReductions::charXorSimple); + tests.put("charAddSimple", TestReductions::charAddSimple); + tests.put("charMulSimple", TestReductions::charMulSimple); + tests.put("charMinSimple", TestReductions::charMinSimple); + tests.put("charMaxSimple", TestReductions::charMaxSimple); + tests.put("charAndDotProduct", TestReductions::charAndDotProduct); + tests.put("charOrDotProduct", TestReductions::charOrDotProduct); + tests.put("charXorDotProduct", TestReductions::charXorDotProduct); + tests.put("charAddDotProduct", TestReductions::charAddDotProduct); + tests.put("charMulDotProduct", TestReductions::charMulDotProduct); + tests.put("charMinDotProduct", TestReductions::charMinDotProduct); + tests.put("charMaxDotProduct", TestReductions::charMaxDotProduct); + tests.put("charAndBig", TestReductions::charAndBig); + tests.put("charOrBig", TestReductions::charOrBig); + tests.put("charXorBig", TestReductions::charXorBig); + tests.put("charAddBig", TestReductions::charAddBig); + tests.put("charMulBig", TestReductions::charMulBig); + tests.put("charMinBig", TestReductions::charMinBig); + tests.put("charMaxBig", TestReductions::charMaxBig); + + tests.put("shortAndSimple", 
TestReductions::shortAndSimple); + tests.put("shortOrSimple", TestReductions::shortOrSimple); + tests.put("shortXorSimple", TestReductions::shortXorSimple); + tests.put("shortAddSimple", TestReductions::shortAddSimple); + tests.put("shortMulSimple", TestReductions::shortMulSimple); + tests.put("shortMinSimple", TestReductions::shortMinSimple); + tests.put("shortMaxSimple", TestReductions::shortMaxSimple); + tests.put("shortAndDotProduct", TestReductions::shortAndDotProduct); + tests.put("shortOrDotProduct", TestReductions::shortOrDotProduct); + tests.put("shortXorDotProduct", TestReductions::shortXorDotProduct); + tests.put("shortAddDotProduct", TestReductions::shortAddDotProduct); + tests.put("shortMulDotProduct", TestReductions::shortMulDotProduct); + tests.put("shortMinDotProduct", TestReductions::shortMinDotProduct); + tests.put("shortMaxDotProduct", TestReductions::shortMaxDotProduct); + tests.put("shortAndBig", TestReductions::shortAndBig); + tests.put("shortOrBig", TestReductions::shortOrBig); + tests.put("shortXorBig", TestReductions::shortXorBig); + tests.put("shortAddBig", TestReductions::shortAddBig); + tests.put("shortMulBig", TestReductions::shortMulBig); + tests.put("shortMinBig", TestReductions::shortMinBig); + tests.put("shortMaxBig", TestReductions::shortMaxBig); + + tests.put("intAndSimple", TestReductions::intAndSimple); + tests.put("intOrSimple", TestReductions::intOrSimple); + tests.put("intXorSimple", TestReductions::intXorSimple); + tests.put("intAddSimple", TestReductions::intAddSimple); + tests.put("intMulSimple", TestReductions::intMulSimple); + tests.put("intMinSimple", TestReductions::intMinSimple); + tests.put("intMaxSimple", TestReductions::intMaxSimple); + tests.put("intAndDotProduct", TestReductions::intAndDotProduct); + tests.put("intOrDotProduct", TestReductions::intOrDotProduct); + tests.put("intXorDotProduct", TestReductions::intXorDotProduct); + tests.put("intAddDotProduct", TestReductions::intAddDotProduct); + tests.put("intMulDotProduct", TestReductions::intMulDotProduct); + tests.put("intMinDotProduct", TestReductions::intMinDotProduct); + tests.put("intMaxDotProduct", TestReductions::intMaxDotProduct); + tests.put("intAndBig", TestReductions::intAndBig); + tests.put("intOrBig", TestReductions::intOrBig); + tests.put("intXorBig", TestReductions::intXorBig); + tests.put("intAddBig", TestReductions::intAddBig); + tests.put("intMulBig", TestReductions::intMulBig); + tests.put("intMinBig", TestReductions::intMinBig); + tests.put("intMaxBig", TestReductions::intMaxBig); + + tests.put("longAndSimple", TestReductions::longAndSimple); + tests.put("longOrSimple", TestReductions::longOrSimple); + tests.put("longXorSimple", TestReductions::longXorSimple); + tests.put("longAddSimple", TestReductions::longAddSimple); + tests.put("longMulSimple", TestReductions::longMulSimple); + tests.put("longMinSimple", TestReductions::longMinSimple); + tests.put("longMaxSimple", TestReductions::longMaxSimple); + tests.put("longAndDotProduct", TestReductions::longAndDotProduct); + tests.put("longOrDotProduct", TestReductions::longOrDotProduct); + tests.put("longXorDotProduct", TestReductions::longXorDotProduct); + tests.put("longAddDotProduct", TestReductions::longAddDotProduct); + tests.put("longMulDotProduct", TestReductions::longMulDotProduct); + tests.put("longMinDotProduct", TestReductions::longMinDotProduct); + tests.put("longMaxDotProduct", TestReductions::longMaxDotProduct); + tests.put("longAndBig", TestReductions::longAndBig); + tests.put("longOrBig", 
TestReductions::longOrBig); + tests.put("longXorBig", TestReductions::longXorBig); + tests.put("longAddBig", TestReductions::longAddBig); + tests.put("longMulBig", TestReductions::longMulBig); + tests.put("longMinBig", TestReductions::longMinBig); + tests.put("longMaxBig", TestReductions::longMaxBig); + + tests.put("floatAddSimple", TestReductions::floatAddSimple); + tests.put("floatMulSimple", TestReductions::floatMulSimple); + tests.put("floatMinSimple", TestReductions::floatMinSimple); + tests.put("floatMaxSimple", TestReductions::floatMaxSimple); + tests.put("floatAddDotProduct", TestReductions::floatAddDotProduct); + tests.put("floatMulDotProduct", TestReductions::floatMulDotProduct); + tests.put("floatMinDotProduct", TestReductions::floatMinDotProduct); + tests.put("floatMaxDotProduct", TestReductions::floatMaxDotProduct); + tests.put("floatAddBig", TestReductions::floatAddBig); + tests.put("floatMulBig", TestReductions::floatMulBig); + tests.put("floatMinBig", TestReductions::floatMinBig); + tests.put("floatMaxBig", TestReductions::floatMaxBig); + + tests.put("doubleAddSimple", TestReductions::doubleAddSimple); + tests.put("doubleMulSimple", TestReductions::doubleMulSimple); + tests.put("doubleMinSimple", TestReductions::doubleMinSimple); + tests.put("doubleMaxSimple", TestReductions::doubleMaxSimple); + tests.put("doubleAddDotProduct", TestReductions::doubleAddDotProduct); + tests.put("doubleMulDotProduct", TestReductions::doubleMulDotProduct); + tests.put("doubleMinDotProduct", TestReductions::doubleMinDotProduct); + tests.put("doubleMaxDotProduct", TestReductions::doubleMaxDotProduct); + tests.put("doubleAddBig", TestReductions::doubleAddBig); + tests.put("doubleMulBig", TestReductions::doubleMulBig); + tests.put("doubleMinBig", TestReductions::doubleMinBig); + tests.put("doubleMaxBig", TestReductions::doubleMaxBig); + + // Compute gold value for all test methods before compilation + for (Map.Entry entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + Object gold = test.run(); + golds.put(name, gold); + } + } + + @Warmup(100) + @Run(test = {"byteAndSimple", + "byteOrSimple", + "byteXorSimple", + "byteAddSimple", + "byteMulSimple", + "byteMinSimple", + "byteMaxSimple", + "byteAndDotProduct", + "byteOrDotProduct", + "byteXorDotProduct", + "byteAddDotProduct", + "byteMulDotProduct", + "byteMinDotProduct", + "byteMaxDotProduct", + "byteAndBig", + "byteOrBig", + "byteXorBig", + "byteAddBig", + "byteMulBig", + "byteMinBig", + "byteMaxBig", + + "charAndSimple", + "charOrSimple", + "charXorSimple", + "charAddSimple", + "charMulSimple", + "charMinSimple", + "charMaxSimple", + "charAndDotProduct", + "charOrDotProduct", + "charXorDotProduct", + "charAddDotProduct", + "charMulDotProduct", + "charMinDotProduct", + "charMaxDotProduct", + "charAndBig", + "charOrBig", + "charXorBig", + "charAddBig", + "charMulBig", + "charMinBig", + "charMaxBig", + + "shortAndSimple", + "shortOrSimple", + "shortXorSimple", + "shortAddSimple", + "shortMulSimple", + "shortMinSimple", + "shortMaxSimple", + "shortAndDotProduct", + "shortOrDotProduct", + "shortXorDotProduct", + "shortAddDotProduct", + "shortMulDotProduct", + "shortMinDotProduct", + "shortMaxDotProduct", + "shortAndBig", + "shortOrBig", + "shortXorBig", + "shortAddBig", + "shortMulBig", + "shortMinBig", + "shortMaxBig", + + "intAndSimple", + "intOrSimple", + "intXorSimple", + "intAddSimple", + "intMulSimple", + "intMinSimple", + "intMaxSimple", + "intAndDotProduct", + "intOrDotProduct", + 
"intXorDotProduct", + "intAddDotProduct", + "intMulDotProduct", + "intMinDotProduct", + "intMaxDotProduct", + "intAndBig", + "intOrBig", + "intXorBig", + "intAddBig", + "intMulBig", + "intMinBig", + "intMaxBig", + + "longAndSimple", + "longOrSimple", + "longXorSimple", + "longAddSimple", + "longMulSimple", + "longMinSimple", + "longMaxSimple", + "longAndDotProduct", + "longOrDotProduct", + "longXorDotProduct", + "longAddDotProduct", + "longMulDotProduct", + "longMinDotProduct", + "longMaxDotProduct", + "longAndBig", + "longOrBig", + "longXorBig", + "longAddBig", + "longMulBig", + "longMinBig", + "longMaxBig", + + "floatAddSimple", + "floatMulSimple", + "floatMinSimple", + "floatMaxSimple", + "floatAddDotProduct", + "floatMulDotProduct", + "floatMinDotProduct", + "floatMaxDotProduct", + "floatAddBig", + "floatMulBig", + "floatMinBig", + "floatMaxBig", + + "doubleAddSimple", + "doubleMulSimple", + "doubleMinSimple", + "doubleMaxSimple", + "doubleAddDotProduct", + "doubleMulDotProduct", + "doubleMinDotProduct", + "doubleMaxDotProduct", + "doubleAddBig", + "doubleMulBig", + "doubleMinBig", + "doubleMaxBig"}) + public void runTests() { + for (Map.Entry entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + // Recall gold value from before compilation + Object gold = golds.get(name); + // Compute new result + Object result = test.run(); + // Compare gold and new result + try { + Verify.checkEQ(gold, result); + } catch (VerifyException e) { + throw new RuntimeException("Verify failed for " + name, e); + } + } + } + + static byte[] fillRandom(byte[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (byte)(int)GEN_I.next(); + } + return a; + } + + static char[] fillRandom(char[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (char)(int)GEN_I.next(); + } + return a; + } + + static short[] fillRandom(short[] a) { + for (int i = 0; i < a.length; i++) { + a[i] = (short)(int)GEN_I.next(); + } + return a; + } + + static int[] fillRandom(int[] a) { + G.fill(GEN_I, a); + return a; + } + + static long[] fillRandom(long[] a) { + G.fill(GEN_L, a); + return a; + } + + static float[] fillRandom(float[] a) { + G.fill(GEN_F, a); + return a; + } + + static double[] fillRandom(double[] a) { + G.fill(GEN_D, a); + return a; + } + + // ---------byte***Simple ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteAndSimple() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteOrSimple() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteXorSimple() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. 
+ private static byte byteAddSimple() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMulSimple() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMinSimple() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMaxSimple() { + byte acc = Byte.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = in1B[i]; + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------byte***DotProduct ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteAndDotProduct() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteOrDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteXorDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteAddDotProduct() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMulDotProduct() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMinDotProduct() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMaxDotProduct() { + byte acc = Byte.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)(in1B[i] * in2B[i]); + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------byte***Big ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. 
+ private static byte byteAndBig() { + byte acc = (byte)0xFF; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteOrBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteXorBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteAddBig() { + byte acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMulBig() { + byte acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMinBig() { + byte acc = Byte.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc = (byte)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_B) // does not vectorize for now, might in the future. + private static byte byteMaxBig() { + byte acc = Byte.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + byte val = (byte)((in1B[i] * in2B[i]) + (in1B[i] * in3B[i]) + (in2B[i] * in3B[i])); + acc = (byte)Math.max(acc, val); + } + return acc; + } + + // ---------char***Simple ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAndSimple() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charOrSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charXorSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAddSimple() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. 
+ private static char charMulSimple() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMinSimple() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMaxSimple() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = in1C[i]; + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------char***DotProduct ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAndDotProduct() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charOrDotProduct() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charXorDotProduct() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAddDotProduct() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMulDotProduct() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMinDotProduct() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMaxDotProduct() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)(in1C[i] * in2C[i]); + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------char***Big ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAndBig() { + char acc = (char)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. 
+ private static char charOrBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charXorBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charAddBig() { + char acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMulBig() { + char acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMinBig() { + char acc = Character.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc = (char)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_C) // does not vectorize for now, might in the future. + private static char charMaxBig() { + char acc = Character.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + char val = (char)((in1C[i] * in2C[i]) + (in1C[i] * in3C[i]) + (in2C[i] * in3C[i])); + acc = (char)Math.max(acc, val); + } + return acc; + } + + // ---------short***Simple ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAndSimple() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortOrSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortXorSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAddSimple() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMulSimple() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
+ private static short shortMinSimple() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMaxSimple() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = in1S[i]; + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------short***DotProduct ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAndDotProduct() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortOrDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortXorDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAddDotProduct() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMulDotProduct() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMinDotProduct() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMaxDotProduct() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)(in1S[i] * in2S[i]); + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------short***Big ------------------------------------------------------------ + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAndBig() { + short acc = (short)0xFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc &= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. 
+ private static short shortOrBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc |= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortXorBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc ^= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortAddBig() { + short acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc += val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMulBig() { + short acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc *= val; + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMinBig() { + short acc = Short.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc = (short)Math.min(acc, val); + } + return acc; + } + + @Test + @IR(failOn = IRNode.LOAD_VECTOR_S) // does not vectorize for now, might in the future. + private static short shortMaxBig() { + short acc = Short.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + short val = (short)((in1S[i] * in2S[i]) + (in1S[i] * in3S[i]) + (in2S[i] * in3S[i])); + acc = (short)Math.max(acc, val); + } + return acc; + } + + // ---------int***Simple ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAndSimple() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intOrSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intXorSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc ^= val; + } + 
return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAddSimple() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMulSimple() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMinSimple() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMaxSimple() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------int***DotProduct ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAndDotProduct() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intOrDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + 
private static int intXorDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAddDotProduct() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMulDotProduct() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMinDotProduct() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMaxDotProduct() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = in1I[i] * in2I[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------int***Big ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAndBig() { + int acc = 0xFFFFFFFF; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intOrBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + 
IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intXorBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc ^= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intAddBig() { + int acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MUL_REDUCTION_VI, "> 0", + IRNode.MUL_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMulBig() { + int acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMinBig() { + int acc = Integer.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VI, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_I, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static int intMaxBig() { + int acc = Integer.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + int val = (in1I[i] * in2I[i]) + (in1I[i] * in3I[i]) + (in2I[i] * in3I[i]); + acc = Math.max(acc, val); + } + return acc; + } + + // ---------long***Simple ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAndSimple() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", 
"true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddSimple() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, // vector accumulator + applyIfCPUFeature = {"avx512dq", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370673 + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "= 0"}, // Reduction NOT moved out of loop + applyIfCPUFeatureOr = {"asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + // Note: NEON does not support MulVL for auto vectorization. There is + // a scalarized implementation, but that is not profitable for + // auto vectorization in almost all cases, and would not be + // profitable here at any rate. + // Hence, we have to keep the reduction inside the loop, and + // cannot use the MulVL as the vector accumulator. + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMulSimple() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. 
Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinSimple() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeatureOr = {"avx512", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMaxSimple() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------long***DotProduct ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAndDotProduct() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. 
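The longMulSimple rules above distinguish a MulVL used as a vector accumulator (with the reduction moved after the loop) from a MulReductionVL that has to stay inside the loop. A minimal stand-alone sketch of the accumulator shape, emulating a hypothetical 4-lane long vector with a plain array (illustration only, not part of the test):

```java
// Illustrative sketch only: the "vector accumulator" shape, with a 4-element
// array standing in for a hypothetical 4-lane long vector.
public class MulAccumulatorSketch {
    static final int LANES = 4; // assumed lane count, for illustration only

    static long mulReduce(long[] in) {
        long[] acc = {1, 1, 1, 1};                  // lane-wise accumulator: the "MulVL" per iteration
        int i = 0;
        for (; i + LANES <= in.length; i += LANES) {
            for (int l = 0; l < LANES; l++) {
                acc[l] *= in[i + l];                // element-wise multiply, no cross-lane dependency
            }
        }
        long r = acc[0] * acc[1] * acc[2] * acc[3]; // single reduction after the loop: the "MulReductionVL"
        for (; i < in.length; i++) {
            r *= in[i];                             // scalar tail
        }
        return r;
    }

    public static void main(String[] args) {
        long[] in = {1, 2, 3, 4, 5, 6, 7};
        System.out.println(mulReduce(in));          // 5040, same as a plain scalar product
    }
}
```

Because NEON has no single-instruction MulVL, the lane-wise multiply in the loop body has no cheap equivalent there, which is why the asimd rule above expects MUL_VL to stay at 0 and the reduction to remain inside the loop.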
+ // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc ^= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While AddReductionVL is implemented in NEON (see longAddSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddDotProduct() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, + applyIfCPUFeature = {"avx512dq", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512dq", "false", "sse4.1", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370673 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // MulVL is not implemented on NEON, so we also do not have the reduction. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMulDotProduct() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeature = {"avx512", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MinReductionV is implemented in NEON (see longMinSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinDotProduct() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeature = {"avx512", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not.
+ // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMaxDotProduct() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = in1L[i] * in2L[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------long***Big ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.AND_REDUCTION_V, "> 0", + IRNode.AND_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While AndReductionV is implemented in NEON (see longAndSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAndBig() { + long acc = 0xFFFFFFFFFFFFFFFFL; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc &= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.OR_REDUCTION_V, "> 0", + IRNode.OR_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While OrReductionV is implemented in NEON (see longOrSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longOrBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc |= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.XOR_REDUCTION_V, "> 0", + IRNode.XOR_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longXorSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longXorBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc ^= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_REDUCTION_VL, "> 0", + IRNode.ADD_VL, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longAddSimple), MulVL is not. 
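All of the long DotProduct and Big comments above come down to the same missing node: even where the reduction itself (AndReductionV, OrReductionV, and so on) is available on NEON, the per-iteration in1L[i] * in2L[i] would have to become a MulVL, so the whole chain stays scalar. A sketch of how the AND dot-product splits into the pieces the IR rules count, emulating a hypothetical 2-lane long vector with plain code (illustration only, not part of the test):

```java
// Illustrative sketch only: the AND dot-product reduction split into the two
// vector pieces the IR rules count, with a 2-element array standing in for a
// hypothetical 2-lane long vector.
public class AndDotProductSketch {
    static long andDotProduct(long[] a, long[] b) {
        long[] acc = {-1L, -1L};                    // all bits set: lane-wise neutral element for AND
        int i = 0;
        for (; i + 2 <= a.length; i += 2) {
            long p0 = a[i] * b[i];                  // element-wise multiply: the MulVL part
            long p1 = a[i + 1] * b[i + 1];
            acc[0] &= p0;                           // lane-wise AND: the AndV part
            acc[1] &= p1;
        }
        long r = acc[0] & acc[1];                   // AndReductionV, once after the loop
        for (; i < a.length; i++) {
            r &= a[i] * b[i];                       // scalar tail
        }
        return r;
    }

    public static void main(String[] args) {
        long[] a = {3, 5, 7, 9};
        long[] b = {2, 4, 6, 8};
        System.out.println(andDotProduct(a, b));    // same value as the scalar AND of all products
    }
}
```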
+ // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longAddBig() { + long acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MUL_REDUCTION_VL, "> 0", + IRNode.MUL_VL, "> 0"}, + applyIfCPUFeature = {"avx512dq", "true"}, + applyIfAnd = {"AutoVectorizationOverrideProfitability", "> 0", + "LoopUnrollLimit", ">= 1000"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeature = {"avx512dq", "true"}, + applyIfAnd = {"AutoVectorizationOverrideProfitability", "> 0", + "LoopUnrollLimit", "< 1000"}) + // Increasing the body limit seems to help. Filed for investigation: JDK-8370685 + // If you can eliminate this exception for LoopUnrollLimit, please remove + // the flag completely from the test, also the "addFlags" at the top. + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // MulVL is not implemented on NEON, so we also not have the reduction. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMulBig() { + long acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VL, "> 0"}, + applyIfCPUFeature = {"avx512", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMinSimple), MulVL is not. + // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMinBig() { + long acc = Long.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VL, "> 0"}, + applyIfCPUFeature = {"avx512", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"avx512", "false", "avx2", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370671 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // While MaxReductionV is implemented in NEON (see longMaxSimple), MulVL is not. 
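The longMulBig rules above are conditioned on LoopUnrollLimit and refer to an addFlags call at the top of the test, which is not shown in this hunk. As a rough sketch of how such a flag is usually supplied to the IR framework (the class name, flag values, and scenario setup here are assumptions for illustration, not the test's actual configuration):

```java
// Illustrative sketch only: passing LoopUnrollLimit to the IR framework.
// The class and values are hypothetical; see the actual test's setup at the top
// of the file for what it really does.
import compiler.lib.ir_framework.Scenario;
import compiler.lib.ir_framework.TestFramework;

public class FlagSetupSketch {
    public static void main(String[] args) {
        // Single flag for all runs, as referenced by the "addFlags" comment above:
        new TestFramework(FlagSetupSketch.class).addFlags("-XX:LoopUnrollLimit=1000").start();

        // Alternatively, both sides of the "LoopUnrollLimit >= 1000" condition
        // could be exercised with scenarios:
        Scenario lowUnroll  = new Scenario(0, "-XX:LoopUnrollLimit=60");
        Scenario highUnroll = new Scenario(1, "-XX:LoopUnrollLimit=1000");
        new TestFramework(FlagSetupSketch.class).addScenarios(lowUnroll, highUnroll).start();
    }
}
```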
+ // Filed: JDK-8370686 + @IR(failOn = IRNode.LOAD_VECTOR_L, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static long longMaxBig() { + long acc = Long.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + long val = (in1L[i] * in2L[i]) + (in1L[i] * in3L[i]) + (in2L[i] * in3L[i]); + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***Simple ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can vectorize. + // Scalar: n loads + n adds + // Vector: n loads + n adds + n extract (sequential order of reduction) + private static float floatAddSimple() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "= 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "= 2"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "< 2"}) + // Not considered profitable by cost model, but if forced we can vectorize. 
+ // Scalar: n loads + n mul + // Vector: n loads + n mul + n extract (sequential order of reduction) + private static float floatMulSimple() { + float acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VF, "> 0"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatMinSimple() { + float acc = Float.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VF, "> 0"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatMaxSimple() { + float acc = Float.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***DotProduct ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "= 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatAddDotProduct() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. 
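The "Scalar: n loads + n ops / Vector: n loads + n ops + n extract" accounting in the comments above follows from the strict, source-order evaluation required for float (and double) add/mul reductions. A sketch over a hypothetical 4-lane vector, written with plain scalars (illustration only, not part of the test), shows that the dependent chain of adds keeps length n even when the loads are grouped:

```java
// Illustrative sketch only: a strict-order (source-order) float add reduction.
// Even if the four loads were a single vector load, each lane still has to be
// extracted and added into acc in order, because float addition is not
// associative; the dependency chain through acc stays length n.
public class StrictOrderAddSketch {
    static float addStrictOrder(float[] in) {
        float acc = 0.0f;
        int i = 0;
        for (; i + 4 <= in.length; i += 4) {
            float l0 = in[i];       // one vector load would fetch these four lanes at once...
            float l1 = in[i + 1];
            float l2 = in[i + 2];
            float l3 = in[i + 3];
            acc += l0;              // ...but the adds remain sequential, lane by lane
            acc += l1;
            acc += l2;
            acc += l3;
        }
        for (; i < in.length; i++) {
            acc += in[i];           // scalar tail
        }
        return acc;
    }

    public static void main(String[] args) {
        float[] in = {1e8f, 1.0f, -1e8f, 1.0f, 0.5f};
        System.out.println(addStrictOrder(in)); // identical result to a plain scalar loop
    }
}
```

The float and double Min/Max variants above do not have this constraint, since min and max can be re-associated, which is why their rules expect MIN_VF/MAX_VF vectors at the default profitability setting.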
+ @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatMulDotProduct() { + float acc = 1; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc *= val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MIN_REDUCTION_V, "> 0", + IRNode.MIN_VF, "> 0"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatMinDotProduct() { + float acc = Float.MAX_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc = Math.min(acc, val); + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MAX_REDUCTION_V, "> 0", + IRNode.MAX_VF, "> 0"}, + applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatMaxDotProduct() { + float acc = Float.MIN_VALUE; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = in1F[i] * in2F[i]; + acc = Math.max(acc, val); + } + return acc; + } + + // ---------float***Big ------------------------------------------------------------ + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.ADD_REDUCTION_V, "> 0", + IRNode.ADD_VF, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIf = {"AutoVectorizationOverrideProfitability", "= 0"}) + private static float floatAddBig() { + float acc = 0; // neutral element + for (int i = 0; i < SIZE; i++) { + float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]); + acc += val; + } + return acc; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0", + IRNode.MUL_REDUCTION_VF, "> 0", + IRNode.MUL_VF, "> 0"}, + applyIfCPUFeature = {"sse4.1", "true"}, + applyIf = {"AutoVectorizationOverrideProfitability", "> 0"}) + @IR(failOn = IRNode.LOAD_VECTOR_F, + applyIfCPUFeatureAnd = {"asimd", "true"}) + // I think this could vectorize, but currently does not. Filed: JDK-8370677 + // But: it is not clear that it would be profitable, given the sequential reduction. 
+    @IR(failOn = IRNode.LOAD_VECTOR_F,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static float floatMulBig() {
+        float acc = 1; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]);
+            acc *= val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0",
+                  IRNode.MIN_REDUCTION_V, "> 0",
+                  IRNode.MIN_VF, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_F,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static float floatMinBig() {
+        float acc = Float.MAX_VALUE; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]);
+            acc = Math.min(acc, val);
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_F, "> 0",
+                  IRNode.MAX_REDUCTION_V, "> 0",
+                  IRNode.MAX_VF, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_F,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static float floatMaxBig() {
+        float acc = Float.MIN_VALUE; // initial value; note: MIN_VALUE is the smallest positive value, not the identity for max
+        for (int i = 0; i < SIZE; i++) {
+            float val = (in1F[i] * in2F[i]) + (in1F[i] * in3F[i]) + (in2F[i] * in3F[i]);
+            acc = Math.max(acc, val);
+        }
+        return acc;
+    }
+
+    // ---------double***Simple ------------------------------------------------------------
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.ADD_REDUCTION_VD, "> 0",
+                  IRNode.ADD_VD, "= 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 2"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "< 2"})
+    // Not considered profitable by the cost model, but if forced we can vectorize.
+    // Scalar: n loads + n adds
+    // Vector: n loads + n adds + n extract (sequential order of reduction)
+    private static double doubleAddSimple() {
+        double acc = 0; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i];
+            acc += val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_REDUCTION_VD, "> 0",
+                  IRNode.MUL_VD, "= 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 2"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "< 2"})
+    // Not considered profitable by the cost model, but if forced we can vectorize.
+    // Scalar: n loads + n mul
+    // Vector: n loads + n mul + n extract (sequential order of reduction)
+    private static double doubleMulSimple() {
+        double acc = 1; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i];
+            acc *= val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MIN_REDUCTION_V, "> 0",
+                  IRNode.MIN_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMinSimple() {
+        double acc = Double.MAX_VALUE; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i];
+            acc = Math.min(acc, val);
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MAX_REDUCTION_V, "> 0",
+                  IRNode.MAX_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMaxSimple() {
+        double acc = Double.MIN_VALUE; // initial value; note: MIN_VALUE is the smallest positive value, not the identity for max
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i];
+            acc = Math.max(acc, val);
+        }
+        return acc;
+    }
+
+    // ---------double***DotProduct ------------------------------------------------------------
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.ADD_REDUCTION_V, "> 0",
+                  IRNode.ADD_VD, "= 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleAddDotProduct() {
+        double acc = 0; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i] * in2D[i];
+            acc += val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_REDUCTION_VD, "> 0",
+                  IRNode.MUL_VD, "> 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMulDotProduct() {
+        double acc = 1; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i] * in2D[i];
+            acc *= val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MIN_REDUCTION_V, "> 0",
+                  IRNode.MIN_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMinDotProduct() {
+        double acc = Double.MAX_VALUE; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i] * in2D[i];
+            acc = Math.min(acc, val);
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MAX_REDUCTION_V, "> 0",
+                  IRNode.MAX_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMaxDotProduct() {
+        double acc = Double.MIN_VALUE; // initial value; note: MIN_VALUE is the smallest positive value, not the identity for max
+        for (int i = 0; i < SIZE; i++) {
+            double val = in1D[i] * in2D[i];
+            acc = Math.max(acc, val);
+        }
+        return acc;
+    }
+
+    // ---------double***Big ------------------------------------------------------------
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.ADD_REDUCTION_V, "> 0",
+                  IRNode.ADD_VD, "> 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleAddBig() {
+        double acc = 0; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]);
+            acc += val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MUL_REDUCTION_VD, "> 0",
+                  IRNode.MUL_VD, "> 0"},
+        applyIfCPUFeature = {"sse4.1", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIfCPUFeatureAnd = {"asimd", "true"})
+    // I think this could vectorize, but currently does not. Filed: JDK-8370677
+    // But: it is not clear that it would be profitable, given the sequential reduction.
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMulBig() {
+        double acc = 1; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]);
+            acc *= val;
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MIN_REDUCTION_V, "> 0",
+                  IRNode.MIN_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMinBig() {
+        double acc = Double.MAX_VALUE; // neutral element
+        for (int i = 0; i < SIZE; i++) {
+            double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]);
+            acc = Math.min(acc, val);
+        }
+        return acc;
+    }
+
+    @Test
+    @IR(counts = {IRNode.LOAD_VECTOR_D, "> 0",
+                  IRNode.MAX_REDUCTION_V, "> 0",
+                  IRNode.MAX_VD, "> 0"},
+        applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"},
+        applyIf = {"AutoVectorizationOverrideProfitability", "> 0"})
+    @IR(failOn = IRNode.LOAD_VECTOR_D,
+        applyIf = {"AutoVectorizationOverrideProfitability", "= 0"})
+    private static double doubleMaxBig() {
+        double acc = Double.MIN_VALUE; // initial value; note: MIN_VALUE is the smallest positive value, not the identity for max
+        for (int i = 0; i < SIZE; i++) {
+            double val = (in1D[i] * in2D[i]) + (in1D[i] * in3D[i]) + (in2D[i] * in3D[i]);
+            acc = Math.max(acc, val);
+        }
+        return acc;
+    }
+
+
+}
diff --git a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java
index ec614cb324bc2..63fbf03008301 100644
--- a/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java
+++ b/test/micro/org/openjdk/bench/vm/compiler/VectorReduction2.java
@@ -28,6 +28,10 @@
 import java.util.concurrent.TimeUnit;
 import java.util.Random;
 
+/**
+ * Note: there is a corresponding IR test:
+ * test/hotspot/jtreg/compiler/loopopts/superword/TestReductions.java
+ */
 @BenchmarkMode(Mode.AverageTime)
 @OutputTimeUnit(TimeUnit.NANOSECONDS)
 @State(Scope.Thread)
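Note on the "(sequential order of reduction)" remarks in the test comments above: float and double add and mul are not associative, so the auto-vectorizer must preserve the scalar loop's left-to-right evaluation order for these reductions, which is why the comments count an extract per element for the vector form. The following is a minimal standalone sketch, not part of the patch (the class name is made up for illustration), showing how reassociating a float sum across lanes can change the result:

// Hypothetical illustration only; not part of TestReductions.java or the patch.
public class SequentialReductionOrderDemo {
    public static void main(String[] args) {
        float[] in = {1e8f, 1.0f, -1e8f, 1.0f};

        // Strict sequential (JLS) order, as in the scalar loops of the test:
        // the 1.0f added right after 1e8f is lost to rounding (adjacent floats
        // near 1e8f are 8.0f apart), so the final result is 1.0f.
        float sequential = 0.0f;
        for (float v : in) {
            sequential += v;
        }

        // Reassociated across two "lanes", the way an unordered vector
        // reduction would combine elements: both 1.0f contributions survive.
        float lane0 = in[0] + in[2]; // 1e8f + (-1e8f) = 0.0f
        float lane1 = in[1] + in[3]; // 1.0f + 1.0f    = 2.0f
        float reassociated = lane0 + lane1;

        System.out.println("sequential   = " + sequential);   // prints 1.0
        System.out.println("reassociated = " + reassociated); // prints 2.0
    }
}

Min and max reductions are insensitive to this reassociation, which is consistent with the IR rules above: they are expected to vectorize whenever AutoVectorizationOverrideProfitability is greater than 0, while some of the strict-order add/mul reductions (for example the double Simple variants) are only expected to vectorize when the cost model is overridden with AutoVectorizationOverrideProfitability=2.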