From 82bd351ef9d024de1ef40fce2ac2ca4630ca19ee Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 6 Nov 2024 14:06:36 +0100 Subject: [PATCH 001/130] 8343685 --- src/hotspot/share/opto/vectorization.cpp | 1168 +--------------------- src/hotspot/share/opto/vectorization.hpp | 171 +--- 2 files changed, 36 insertions(+), 1303 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index fc4eaccff5ce5..9259f6bbe29f1 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -405,10 +405,6 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -#ifndef PRODUCT -int VPointer::Tracer::_depth = 0; -#endif - VPointer::VPointer(MemNode* const mem, const VLoop& vloop, Node_Stack* nstack, bool analyze_only) : _mem(mem), _vloop(vloop), @@ -416,117 +412,11 @@ VPointer::VPointer(MemNode* const mem, const VLoop& vloop, #ifdef ASSERT _debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr), #endif - _has_int_index_after_convI2L(false), - _int_index_after_convI2L_offset(0), - _int_index_after_convI2L_invar(nullptr), - _int_index_after_convI2L_scale(0), _nstack(nstack), _analyze_only(analyze_only), _stack_idx(0) -#ifndef PRODUCT - , _tracer(vloop.is_trace_pointer_analysis()) -#endif { - NOT_PRODUCT(_tracer.ctor_1(mem);) - - Node* adr = mem->in(MemNode::Address); - if (!adr->is_AddP()) { - assert(!valid(), "too complex"); - return; - } - // Match AddP(base, AddP(ptr, k*iv [+ invariant]), constant) - Node* base = adr->in(AddPNode::Base); - // The base address should be loop invariant - if (is_loop_member(base)) { - assert(!valid(), "base address is loop variant"); - return; - } - // unsafe references require misaligned vector access support - if (base->is_top() && !Matcher::misaligned_vectors_ok()) { - assert(!valid(), "unsafe access"); - return; - } - - NOT_PRODUCT(if(_tracer._is_trace_alignment) _tracer.store_depth();) - NOT_PRODUCT(_tracer.ctor_2(adr);) - - 
int i; - for (i = 0; ; i++) { - NOT_PRODUCT(_tracer.ctor_3(adr, i);) - - if (!scaled_iv_plus_offset(adr->in(AddPNode::Offset))) { - assert(!valid(), "too complex"); - return; - } - adr = adr->in(AddPNode::Address); - NOT_PRODUCT(_tracer.ctor_4(adr, i);) - - if (base == adr || !adr->is_AddP()) { - NOT_PRODUCT(_tracer.ctor_5(adr, base, i);) - break; // stop looking at addp's - } - } - if (!invariant(adr)) { - // The address must be invariant for the current loop. But if we are in a main-loop, - // it must also be invariant of the pre-loop, otherwise we cannot use this address - // for the pre-loop limit adjustment required for main-loop alignment. - assert(!valid(), "adr is loop variant"); - return; - } - - if (!base->is_top() && adr != base) { - assert(!valid(), "adr and base differ"); - return; - } - - NOT_PRODUCT(if(_tracer._is_trace_alignment) _tracer.restore_depth();) - NOT_PRODUCT(_tracer.ctor_6(mem);) - - // In the pointer analysis, and especially the AlignVector, analysis we assume that - // stride and scale are not too large. For example, we multiply "scale * stride", - // and assume that this does not overflow the int range. We also take "abs(scale)" - // and "abs(stride)", which would overflow for min_int = -(2^31). Still, we want - // to at least allow small and moderately large stride and scale. Therefore, we - // allow values up to 2^30, which is only a factor 2 smaller than the max/min int. - // Normal performance relevant code will have much lower values. And the restriction - // allows us to keep the rest of the autovectorization code much simpler, since we - // do not have to deal with overflows. 
- jlong long_scale = _scale; - jlong long_stride = _vloop.iv_stride(); - jlong max_val = 1 << 30; - if (abs(long_scale) >= max_val || - abs(long_stride) >= max_val || - abs(long_scale * long_stride) >= max_val) { - assert(!valid(), "adr stride*scale is too large"); - return; - } - - if (!is_safe_to_use_as_simple_form(base, adr)) { - assert(!valid(), "does not have simple form"); - return; - } - - _base = base; - _adr = adr; - assert(valid(), "Usable"); + assert(!valid(), "all must be invalid!!!"); } -// Following is used to create a temporary object during -// the pattern match of an address expression. -VPointer::VPointer(VPointer* p) : - _mem(p->_mem), _vloop(p->_vloop), - _base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr), -#ifdef ASSERT - _debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr), -#endif - _has_int_index_after_convI2L(false), - _int_index_after_convI2L_offset(0), - _int_index_after_convI2L_invar(nullptr), - _int_index_after_convI2L_scale(0), - _nstack(p->_nstack), _analyze_only(p->_analyze_only), _stack_idx(p->_stack_idx) -#ifndef PRODUCT - , _tracer(p->_tracer._is_trace_alignment) -#endif -{} - // Biggest detectable factor of the invariant. int VPointer::invar_factor() const { Node* n = invar(); @@ -543,783 +433,6 @@ int VPointer::invar_factor() const { return 1; } -// We would like to make decisions about aliasing (i.e. removing memory edges) and adjacency -// (i.e. which loads/stores can be packed) based on the simple form: -// -// s_pointer = adr + offset + invar + scale * ConvI2L(iv) -// -// However, we parse the compound-long-int form: -// -// c_pointer = adr + long_offset + long_invar + long_scale * ConvI2L(int_index) -// int_index = int_offset + int_invar + int_scale * iv -// -// In general, the simple and the compound-long-int form do not always compute the same pointer -// at runtime. For example, the simple form would give a different result due to an overflow -// in the int_index. 
-// -// Example: -// For both forms, we have: -// iv = 0 -// scale = 1 -// -// We now account the offset and invar once to the long part and once to the int part: -// Pointer 1 (long offset and long invar): -// long_offset = min_int -// long_invar = min_int -// int_offset = 0 -// int_invar = 0 -// -// Pointer 2 (int offset and int invar): -// long_offset = 0 -// long_invar = 0 -// int_offset = min_int -// int_invar = min_int -// -// This gives us the following pointers: -// Compound-long-int form pointers: -// Form: -// c_pointer = adr + long_offset + long_invar + long_scale * ConvI2L(int_offset + int_invar + int_scale * iv) -// -// Pointers: -// c_pointer1 = adr + min_int + min_int + 1 * ConvI2L(0 + 0 + 1 * 0) -// = adr + min_int + min_int -// = adr - 2^32 -// -// c_pointer2 = adr + 0 + 0 + 1 * ConvI2L(min_int + min_int + 1 * 0) -// = adr + ConvI2L(min_int + min_int) -// = adr + 0 -// = adr -// -// Simple form pointers: -// Form: -// s_pointer = adr + offset + invar + scale * ConvI2L(iv) -// s_pointer = adr + (long_offset + int_offset) + (long_invar + int_invar) + (long_scale * int_scale) * ConvI2L(iv) -// -// Pointers: -// s_pointer1 = adr + (min_int + 0 ) + (min_int + 0 ) + 1 * 0 -// = adr + min_int + min_int -// = adr - 2^32 -// s_pointer2 = adr + (0 + min_int ) + (0 + min_int ) + 1 * 0 -// = adr + min_int + min_int -// = adr - 2^32 -// -// We see that the two addresses are actually 2^32 bytes apart (derived from the c_pointers), but their simple form look identical. -// -// Hence, we need to determine in which cases it is safe to make decisions based on the simple -// form, rather than the compound-long-int form. If we cannot prove that using the simple form -// is safe (i.e. equivalent to the compound-long-int form), then we do not get a valid VPointer, -// and the associated memop cannot be vectorized. 
-bool VPointer::is_safe_to_use_as_simple_form(Node* base, Node* adr) const { -#ifndef _LP64 - // On 32-bit platforms, there is never an explicit int_index with ConvI2L for the iv. Thus, the - // parsed pointer form is always the simple form, with int operations: - // - // pointer = adr + offset + invar + scale * iv - // - assert(!_has_int_index_after_convI2L, "32-bit never has an int_index with ConvI2L for the iv"); - return true; -#else - - // Array accesses that are not Unsafe always have a RangeCheck which ensures that there is no - // int_index overflow. This implies that the conversion to long can be done separately: - // - // ConvI2L(int_index) = ConvI2L(int_offset) + ConvI2L(int_invar) + ConvI2L(scale) * ConvI2L(iv) - // - // And hence, the simple form is guaranteed to be identical to the compound-long-int form at - // runtime and the VPointer is safe/valid to be used. - const TypeAryPtr* ary_ptr_t = _mem->adr_type()->isa_aryptr(); - if (ary_ptr_t != nullptr) { - if (!_mem->is_unsafe_access()) { - return true; - } - } - - // We did not find the int_index. Just to be safe, reject this VPointer. - if (!_has_int_index_after_convI2L) { - return false; - } - - int int_offset = _int_index_after_convI2L_offset; - Node* int_invar = _int_index_after_convI2L_invar; - int int_scale = _int_index_after_convI2L_scale; - int long_scale = _scale / int_scale; - - // If "int_index = iv", then the simple form is identical to the compound-long-int form. - // - // int_index = int_offset + int_invar + int_scale * iv - // = 0 0 1 * iv - // = iv - if (int_offset == 0 && int_invar == nullptr && int_scale == 1) { - return true; - } - - // Intuition: What happens if the int_index overflows? 
Let us look at two pointers on the "overflow edge": - // - // pointer1 = adr + ConvI2L(int_index1) - // pointer2 = adr + ConvI2L(int_index2) - // - // int_index1 = max_int + 0 = max_int -> very close to but before the overflow - // int_index2 = max_int + 1 = min_int -> just enough to get the overflow - // - // When looking at the difference of pointer1 and pointer2, we notice that it is very large - // (almost 2^32). Since arrays have at most 2^31 elements, chances are high that pointer2 is - // an actual out-of-bounds access at runtime. These would normally be prevented by range checks - // at runtime. However, if the access was done by using Unsafe, where range checks are omitted, - // then an out-of-bounds access constitutes undefined behavior. This means that we are allowed to - // do anything, including changing the behavior. - // - // If we can set the right conditions, we have a guarantee that an overflow is either impossible - // (no overflow or range checks preventing that) or undefined behavior. In both cases, we are - // safe to do a vectorization. - // - // Approach: We want to prove a lower bound for the distance between these two pointers, and an - // upper bound for the size of a memory object. We can derive such an upper bound for - // arrays. We know they have at most 2^31 elements. 
If we know the size of the elements - // in bytes, we have: - // - // array_element_size_in_bytes * 2^31 >= max_possible_array_size_in_bytes - // >= array_size_in_bytes (ARR) - // - // If some small difference "delta" leads to an int_index overflow, we know that the - // int_index1 before overflow must have been close to max_int, and the int_index2 after - // the overflow must be close to min_int: - // - // pointer1 = adr + long_offset + long_invar + long_scale * ConvI2L(int_index1) - // =approx adr + long_offset + long_invar + long_scale * max_int - // - // pointer2 = adr + long_offset + long_invar + long_scale * ConvI2L(int_index2) - // =approx adr + long_offset + long_invar + long_scale * min_int - // - // We realize that the pointer difference is very large: - // - // difference =approx long_scale * 2^32 - // - // Hence, if we set the right condition for long_scale and array_element_size_in_bytes, - // we can prove that an overflow is impossible (or would imply undefined behaviour). - // - // We must now take this intuition, and develop a rigorous proof. We start by stating the problem - // more precisely, with the help of some definitions and the Statement we are going to prove. - // - // Definition: - // Two VPointers are "comparable" (i.e. VPointer::comparable is true, set with VPointer::cmp()), - // iff all of these conditions apply for the simple form: - // 1) Both VPointers are valid. - // 2) The adr are identical, or both are array bases of different arrays. - // 3) They have identical scale. - // 4) They have identical invar. - // 5) The difference in offsets is limited: abs(offset1 - offset2) < 2^31. (DIFF) - // - // For the Vectorization Optimization, we pair-wise compare VPointers and determine if they are: - // 1) "not comparable": - // We do not optimize them (assume they alias, not assume adjacency). 
- // - // Whenever we chose this option based on the simple form, it is also correct based on the - // compound-long-int form, since we make no optimizations based on it. - // - // 2) "comparable" with different array bases at runtime: - // We assume they do not alias (remove memory edges), but not assume adjacency. - // - // Whenever we have two different array bases for the simple form, we also have different - // array bases for the compound-long-form. Since VPointers provably point to different - // memory objects, they can never alias. - // - // 3) "comparable" with the same base address: - // We compute the relative pointer difference, and based on the load/store size we can - // compute aliasing and adjacency. - // - // We must find a condition under which the pointer difference of the simple form is - // identical to the pointer difference of the compound-long-form. We do this with the - // Statement below, which we then proceed to prove. - // - // Statement: - // If two VPointers satisfy these 3 conditions: - // 1) They are "comparable". - // 2) They have the same base address. - // 3) Their long_scale is a multiple of the array element size in bytes: - // - // abs(long_scale) % array_element_size_in_bytes = 0 (A) - // - // Then their pointer difference of the simple form is identical to the pointer difference - // of the compound-long-int form. - // - // More precisely: - // Such two VPointers by definition have identical adr, invar, and scale. 
- // Their simple form is: - // - // s_pointer1 = adr + offset1 + invar + scale * ConvI2L(iv) (B1) - // s_pointer2 = adr + offset2 + invar + scale * ConvI2L(iv) (B2) - // - // Thus, the pointer difference of the simple forms collapses to the difference in offsets: - // - // s_difference = s_pointer1 - s_pointer2 = offset1 - offset2 (C) - // - // Their compound-long-int form for these VPointer is: - // - // c_pointer1 = adr + long_offset1 + long_invar1 + long_scale1 * ConvI2L(int_index1) (D1) - // int_index1 = int_offset1 + int_invar1 + int_scale1 * iv (D2) - // - // c_pointer2 = adr + long_offset2 + long_invar2 + long_scale2 * ConvI2L(int_index2) (D3) - // int_index2 = int_offset2 + int_invar2 + int_scale2 * iv (D4) - // - // And these are the offset1, offset2, invar and scale from the simple form (B1) and (B2): - // - // offset1 = long_offset1 + long_scale1 * ConvI2L(int_offset1) (D5) - // offset2 = long_offset2 + long_scale2 * ConvI2L(int_offset2) (D6) - // - // invar = long_invar1 + long_scale1 * ConvI2L(int_invar1) - // = long_invar2 + long_scale2 * ConvI2L(int_invar2) (D7) - // - // scale = long_scale1 * ConvI2L(int_scale1) - // = long_scale2 * ConvI2L(int_scale2) (D8) - // - // The pointer difference of the compound-long-int form is defined as: - // - // c_difference = c_pointer1 - c_pointer2 - // - // Thus, the statement claims that for the two VPointer we have: - // - // s_difference = c_difference (Statement) - // - // We prove the Statement with the help of a Lemma: - // - // Lemma: - // There is some integer x, such that: - // - // c_difference = s_difference + array_element_size_in_bytes * x * 2^32 (Lemma) - // - // From condition (DIFF), we can derive: - // - // abs(s_difference) < 2^31 (E) - // - // Assuming the Lemma, we prove the Statement: - // If "x = 0" (intuitively: the int_index does not overflow), then: - // c_difference = s_difference - // and hence the simple form computes the same pointer difference as the compound-long-int form. 
- // If "x != 0" (intuitively: the int_index overflows), then: - // abs(c_difference) >= abs(s_difference + array_element_size_in_bytes * x * 2^32) - // >= array_element_size_in_bytes * 2^32 - abs(s_difference) - // -- apply (E) -- - // > array_element_size_in_bytes * 2^32 - 2^31 - // >= array_element_size_in_bytes * 2^31 - // -- apply (ARR) -- - // >= max_possible_array_size_in_bytes - // >= array_size_in_bytes - // - // This shows that c_pointer1 and c_pointer2 have a distance that exceeds the maximum array size. - // Thus, at least one of the two pointers must be outside of the array bounds. But we can assume - // that out-of-bounds accesses do not happen. If they still do, it is undefined behavior. Hence, - // we are allowed to do anything. We can also "safely" use the simple form in this case even though - // it might not match the compound-long-int form at runtime. - // QED Statement. - // - // We must now prove the Lemma. - // - // ConvI2L always truncates by some power of 2^32, i.e. 
there is some integer y such that: - // - // ConvI2L(y1 + y2) = ConvI2L(y1) + ConvI2L(y2) + 2^32 * y (F) - // - // It follows, that there is an integer y1 such that: - // - // ConvI2L(int_index1) = ConvI2L(int_offset1 + int_invar1 + int_scale1 * iv) - // -- apply (F) -- - // = ConvI2L(int_offset1) - // + ConvI2L(int_invar1) - // + ConvI2L(int_scale1) * ConvI2L(iv) - // + y1 * 2^32 (G) - // - // Thus, we can write the compound-long-int form (D1) as: - // - // c_pointer1 = adr + long_offset1 + long_invar1 + long_scale1 * ConvI2L(int_index1) - // -- apply (G) -- - // = adr - // + long_offset1 - // + long_invar1 - // + long_scale1 * ConvI2L(int_offset1) - // + long_scale1 * ConvI2L(int_invar1) - // + long_scale1 * ConvI2L(int_scale1) * ConvI2L(iv) - // + long_scale1 * y1 * 2^32 (H) - // - // And we can write the simple form as: - // - // s_pointer1 = adr + offset1 + invar + scale * ConvI2L(iv) - // -- apply (D5, D7, D8) -- - // = adr - // + long_offset1 - // + long_scale1 * ConvI2L(int_offset1) - // + long_invar1 - // + long_scale1 * ConvI2L(int_invar1) - // + long_scale1 * ConvI2L(int_scale1) * ConvI2L(iv) (K) - // - // We now compute the pointer difference between the simple (K) and compound-long-int form (H). 
- // Most terms cancel out immediately: - // - // sc_difference1 = c_pointer1 - s_pointer1 = long_scale1 * y1 * 2^32 (L) - // - // Rearranging the equation (L), we get: - // - // c_pointer1 = s_pointer1 + long_scale1 * y1 * 2^32 (M) - // - // And since long_scale1 is a multiple of array_element_size_in_bytes, there is some integer - // x1, such that (M) implies: - // - // c_pointer1 = s_pointer1 + array_element_size_in_bytes * x1 * 2^32 (N) - // - // With an analogue equation for c_pointer2, we can now compute the pointer difference for - // the compound-long-int form: - // - // c_difference = c_pointer1 - c_pointer2 - // -- apply (N) -- - // = s_pointer1 + array_element_size_in_bytes * x1 * 2^32 - // -(s_pointer2 + array_element_size_in_bytes * x2 * 2^32) - // -- where "x = x1 - x2" -- - // = s_pointer1 - s_pointer2 + array_element_size_in_bytes * x * 2^32 - // -- apply (C) -- - // = s_difference + array_element_size_in_bytes * x * 2^32 - // QED Lemma. - if (ary_ptr_t != nullptr) { - BasicType array_element_bt = ary_ptr_t->elem()->array_element_basic_type(); - if (is_java_primitive(array_element_bt)) { - int array_element_size_in_bytes = type2aelembytes(array_element_bt); - if (abs(long_scale) % array_element_size_in_bytes == 0) { - return true; - } - } - } - - // General case: we do not know if it is safe to use the simple form. - return false; -#endif -} - -bool VPointer::is_loop_member(Node* n) const { - Node* n_c = phase()->get_ctrl(n); - return lpt()->is_member(phase()->get_loop(n_c)); -} - -bool VPointer::invariant(Node* n) const { - NOT_PRODUCT(Tracer::Depth dd;) - bool is_not_member = !is_loop_member(n); - if (is_not_member) { - CountedLoopNode* cl = lpt()->_head->as_CountedLoop(); - if (cl->is_main_loop()) { - // Check that n_c dominates the pre loop head node. 
If it does not, then - // we cannot use n as invariant for the pre loop CountedLoopEndNode check - // because n_c is either part of the pre loop or between the pre and the - // main loop (Illegal invariant happens when n_c is a CastII node that - // prevents data nodes to flow above the main loop). - Node* n_c = phase()->get_ctrl(n); - return phase()->is_dominator(n_c, _vloop.pre_loop_head()); - } - } - return is_not_member; -} - -// Match: k*iv + offset -// where: k is a constant that maybe zero, and -// offset is (k2 [+/- invariant]) where k2 maybe zero and invariant is optional -bool VPointer::scaled_iv_plus_offset(Node* n) { - NOT_PRODUCT(Tracer::Depth ddd;) - NOT_PRODUCT(_tracer.scaled_iv_plus_offset_1(n);) - - if (scaled_iv(n)) { - NOT_PRODUCT(_tracer.scaled_iv_plus_offset_2(n);) - return true; - } - - if (offset_plus_k(n)) { - NOT_PRODUCT(_tracer.scaled_iv_plus_offset_3(n);) - return true; - } - - int opc = n->Opcode(); - if (opc == Op_AddI) { - if (offset_plus_k(n->in(2)) && scaled_iv_plus_offset(n->in(1))) { - NOT_PRODUCT(_tracer.scaled_iv_plus_offset_4(n);) - return true; - } - if (offset_plus_k(n->in(1)) && scaled_iv_plus_offset(n->in(2))) { - NOT_PRODUCT(_tracer.scaled_iv_plus_offset_5(n);) - return true; - } - } else if (opc == Op_SubI || opc == Op_SubL) { - if (offset_plus_k(n->in(2), true) && scaled_iv_plus_offset(n->in(1))) { - // (offset1 + invar1 + scale * iv) - (offset2 + invar2) - // Subtraction handled via "negate" flag of "offset_plus_k". - NOT_PRODUCT(_tracer.scaled_iv_plus_offset_6(n);) - return true; - } - VPointer tmp(this); - if (offset_plus_k(n->in(1)) && tmp.scaled_iv_plus_offset(n->in(2))) { - // (offset1 + invar1) - (offset2 + invar2 + scale * iv) - // Subtraction handled explicitly below. - assert(_scale == 0, "shouldn't be set yet"); - // _scale = -tmp._scale - if (!try_MulI_no_overflow(-1, tmp._scale, _scale)) { - return false; // mul overflow. 
- } - // _offset -= tmp._offset - if (!try_SubI_no_overflow(_offset, tmp._offset, _offset)) { - return false; // sub overflow. - } - // _invar -= tmp._invar - if (tmp._invar != nullptr) { - maybe_add_to_invar(tmp._invar, true); -#ifdef ASSERT - _debug_invar_scale = tmp._debug_invar_scale; - _debug_negate_invar = !tmp._debug_negate_invar; -#endif - } - - // Forward info about the int_index: - assert(!_has_int_index_after_convI2L, "no previous int_index discovered"); - _has_int_index_after_convI2L = tmp._has_int_index_after_convI2L; - _int_index_after_convI2L_offset = tmp._int_index_after_convI2L_offset; - _int_index_after_convI2L_invar = tmp._int_index_after_convI2L_invar; - _int_index_after_convI2L_scale = tmp._int_index_after_convI2L_scale; - - NOT_PRODUCT(_tracer.scaled_iv_plus_offset_7(n);) - return true; - } - } - - NOT_PRODUCT(_tracer.scaled_iv_plus_offset_8(n);) - return false; -} - -// Match: k*iv where k is a constant that's not zero -bool VPointer::scaled_iv(Node* n) { - NOT_PRODUCT(Tracer::Depth ddd;) - NOT_PRODUCT(_tracer.scaled_iv_1(n);) - - if (_scale != 0) { // already found a scale - NOT_PRODUCT(_tracer.scaled_iv_2(n, _scale);) - return false; - } - - if (n == iv()) { - _scale = 1; - NOT_PRODUCT(_tracer.scaled_iv_3(n, _scale);) - return true; - } - if (_analyze_only && (is_loop_member(n))) { - _nstack->push(n, _stack_idx++); - } - - int opc = n->Opcode(); - if (opc == Op_MulI) { - if (n->in(1) == iv() && n->in(2)->is_Con()) { - _scale = n->in(2)->get_int(); - NOT_PRODUCT(_tracer.scaled_iv_4(n, _scale);) - return true; - } else if (n->in(2) == iv() && n->in(1)->is_Con()) { - _scale = n->in(1)->get_int(); - NOT_PRODUCT(_tracer.scaled_iv_5(n, _scale);) - return true; - } - } else if (opc == Op_LShiftI) { - if (n->in(1) == iv() && n->in(2)->is_Con()) { - if (!try_LShiftI_no_overflow(1, n->in(2)->get_int(), _scale)) { - return false; // shift overflow. 
- } - NOT_PRODUCT(_tracer.scaled_iv_6(n, _scale);) - return true; - } - } else if (opc == Op_ConvI2L && !has_iv()) { - // So far we have not found the iv yet, and are about to enter a ConvI2L subgraph, - // which may be the int index (that might overflow) for the memory access, of the form: - // - // int_index = int_offset + int_invar + int_scale * iv - // - // If we simply continue parsing with the current VPointer, then the int_offset and - // int_invar simply get added to the long offset and invar. But for the checks in - // VPointer::is_safe_to_use_as_simple_form() we need to have explicit access to the - // int_index. Thus, we must parse it explicitly here. For this, we use a temporary - // VPointer, to pattern match the int_index sub-expression of the address. - - NOT_PRODUCT(Tracer::Depth dddd;) - VPointer tmp(this); - NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);) - - if (tmp.scaled_iv_plus_offset(n->in(1)) && tmp.has_iv()) { - // We successfully matched an integer index, of the form: - // int_index = int_offset + int_invar + int_scale * iv - // Forward scale. - assert(_scale == 0 && tmp._scale != 0, "iv only found just now"); - _scale = tmp._scale; - // Accumulate offset. - if (!try_AddI_no_overflow(_offset, tmp._offset, _offset)) { - return false; // add overflow. - } - // Accumulate invar. 
- if (tmp._invar != nullptr) { - maybe_add_to_invar(tmp._invar, false); - } - // Set info about the int_index: - assert(!_has_int_index_after_convI2L, "no previous int_index discovered"); - _has_int_index_after_convI2L = true; - _int_index_after_convI2L_offset = tmp._offset; - _int_index_after_convI2L_invar = tmp._invar; - _int_index_after_convI2L_scale = tmp._scale; - - NOT_PRODUCT(_tracer.scaled_iv_7(n);) - return true; - } - } else if (opc == Op_ConvI2L || opc == Op_CastII) { - if (scaled_iv_plus_offset(n->in(1))) { - NOT_PRODUCT(_tracer.scaled_iv_7(n);) - return true; - } - } else if (opc == Op_LShiftL && n->in(2)->is_Con()) { - if (!has_iv()) { - // Need to preserve the current _offset value, so - // create a temporary object for this expression subtree. - // Hacky, so should re-engineer the address pattern match. - NOT_PRODUCT(Tracer::Depth dddd;) - VPointer tmp(this); - NOT_PRODUCT(_tracer.scaled_iv_8(n, &tmp);) - - if (tmp.scaled_iv_plus_offset(n->in(1))) { - int shift = n->in(2)->get_int(); - // Accumulate scale. - if (!try_LShiftI_no_overflow(tmp._scale, shift, _scale)) { - return false; // shift overflow. - } - // Accumulate offset. - int shifted_offset = 0; - if (!try_LShiftI_no_overflow(tmp._offset, shift, shifted_offset)) { - return false; // shift overflow. - } - if (!try_AddI_no_overflow(_offset, shifted_offset, _offset)) { - return false; // add overflow. - } - // Accumulate invar. 
- if (tmp._invar != nullptr) { - BasicType bt = tmp._invar->bottom_type()->basic_type(); - assert(bt == T_INT || bt == T_LONG, ""); - maybe_add_to_invar(register_if_new(LShiftNode::make(tmp._invar, n->in(2), bt)), false); -#ifdef ASSERT - _debug_invar_scale = n->in(2); -#endif - } - - // Forward info about the int_index: - assert(!_has_int_index_after_convI2L, "no previous int_index discovered"); - _has_int_index_after_convI2L = tmp._has_int_index_after_convI2L; - _int_index_after_convI2L_offset = tmp._int_index_after_convI2L_offset; - _int_index_after_convI2L_invar = tmp._int_index_after_convI2L_invar; - _int_index_after_convI2L_scale = tmp._int_index_after_convI2L_scale; - - NOT_PRODUCT(_tracer.scaled_iv_9(n, _scale, _offset, _invar);) - return true; - } - } - } - NOT_PRODUCT(_tracer.scaled_iv_10(n);) - return false; -} - -// Match: offset is (k [+/- invariant]) -// where k maybe zero and invariant is optional, but not both. -bool VPointer::offset_plus_k(Node* n, bool negate) { - NOT_PRODUCT(Tracer::Depth ddd;) - NOT_PRODUCT(_tracer.offset_plus_k_1(n);) - - int opc = n->Opcode(); - if (opc == Op_ConI) { - if (!try_AddSubI_no_overflow(_offset, n->get_int(), negate, _offset)) { - return false; // add/sub overflow. - } - NOT_PRODUCT(_tracer.offset_plus_k_2(n, _offset);) - return true; - } else if (opc == Op_ConL) { - // Okay if value fits into an int - const TypeLong* t = n->find_long_type(); - if (t->higher_equal(TypeLong::INT)) { - jlong loff = n->get_long(); - jint off = (jint)loff; - if (!try_AddSubI_no_overflow(_offset, off, negate, _offset)) { - return false; // add/sub overflow. 
- } - NOT_PRODUCT(_tracer.offset_plus_k_3(n, _offset);) - return true; - } - NOT_PRODUCT(_tracer.offset_plus_k_4(n);) - return false; - } - assert((_debug_invar == nullptr) == (_invar == nullptr), ""); - - if (_analyze_only && is_loop_member(n)) { - _nstack->push(n, _stack_idx++); - } - if (opc == Op_AddI) { - if (n->in(2)->is_Con() && invariant(n->in(1))) { - maybe_add_to_invar(n->in(1), negate); - if (!try_AddSubI_no_overflow(_offset, n->in(2)->get_int(), negate, _offset)) { - return false; // add/sub overflow. - } - NOT_PRODUCT(_tracer.offset_plus_k_6(n, _invar, negate, _offset);) - return true; - } else if (n->in(1)->is_Con() && invariant(n->in(2))) { - if (!try_AddSubI_no_overflow(_offset, n->in(1)->get_int(), negate, _offset)) { - return false; // add/sub overflow. - } - maybe_add_to_invar(n->in(2), negate); - NOT_PRODUCT(_tracer.offset_plus_k_7(n, _invar, negate, _offset);) - return true; - } - } - if (opc == Op_SubI) { - if (n->in(2)->is_Con() && invariant(n->in(1))) { - maybe_add_to_invar(n->in(1), negate); - if (!try_AddSubI_no_overflow(_offset, n->in(2)->get_int(), !negate, _offset)) { - return false; // add/sub overflow. - } - NOT_PRODUCT(_tracer.offset_plus_k_8(n, _invar, negate, _offset);) - return true; - } else if (n->in(1)->is_Con() && invariant(n->in(2))) { - if (!try_AddSubI_no_overflow(_offset, n->in(1)->get_int(), negate, _offset)) { - return false; // add/sub overflow. - } - maybe_add_to_invar(n->in(2), !negate); - NOT_PRODUCT(_tracer.offset_plus_k_9(n, _invar, !negate, _offset);) - return true; - } - } - - if (!is_loop_member(n)) { - // 'n' is loop invariant. Skip ConvI2L and CastII nodes before checking if 'n' is dominating the pre loop. - if (opc == Op_ConvI2L) { - n = n->in(1); - } - if (n->Opcode() == Op_CastII) { - // Skip CastII nodes - assert(!is_loop_member(n), "sanity"); - n = n->in(1); - } - // Check if 'n' can really be used as invariant (not in main loop and dominating the pre loop). 
- if (invariant(n)) { - maybe_add_to_invar(n, negate); - NOT_PRODUCT(_tracer.offset_plus_k_10(n, _invar, negate, _offset);) - return true; - } - } - - NOT_PRODUCT(_tracer.offset_plus_k_11(n);) - return false; -} - -Node* VPointer::maybe_negate_invar(bool negate, Node* invar) { -#ifdef ASSERT - _debug_negate_invar = negate; -#endif - if (negate) { - BasicType bt = invar->bottom_type()->basic_type(); - assert(bt == T_INT || bt == T_LONG, ""); - PhaseIterGVN& igvn = phase()->igvn(); - Node* zero = igvn.zerocon(bt); - phase()->set_ctrl(zero, phase()->C->root()); - Node* sub = SubNode::make(zero, invar, bt); - invar = register_if_new(sub); - } - return invar; -} - -Node* VPointer::register_if_new(Node* n) const { - PhaseIterGVN& igvn = phase()->igvn(); - Node* prev = igvn.hash_find_insert(n); - if (prev != nullptr) { - n->destruct(&igvn); - n = prev; - } else { - Node* c = phase()->get_early_ctrl(n); - phase()->register_new_node(n, c); - } - return n; -} - -void VPointer::maybe_add_to_invar(Node* new_invar, bool negate) { - new_invar = maybe_negate_invar(negate, new_invar); - if (_invar == nullptr) { - _invar = new_invar; -#ifdef ASSERT - _debug_invar = new_invar; -#endif - return; - } -#ifdef ASSERT - _debug_invar = NodeSentinel; -#endif - BasicType new_invar_bt = new_invar->bottom_type()->basic_type(); - assert(new_invar_bt == T_INT || new_invar_bt == T_LONG, ""); - BasicType invar_bt = _invar->bottom_type()->basic_type(); - assert(invar_bt == T_INT || invar_bt == T_LONG, ""); - - BasicType bt = (new_invar_bt == T_LONG || invar_bt == T_LONG) ? 
T_LONG : T_INT; - Node* current_invar = _invar; - if (invar_bt != bt) { - assert(bt == T_LONG && invar_bt == T_INT, ""); - assert(new_invar_bt == bt, ""); - current_invar = register_if_new(new ConvI2LNode(current_invar)); - } else if (new_invar_bt != bt) { - assert(bt == T_LONG && new_invar_bt == T_INT, ""); - assert(invar_bt == bt, ""); - new_invar = register_if_new(new ConvI2LNode(new_invar)); - } - Node* add = AddNode::make(current_invar, new_invar, bt); - _invar = register_if_new(add); -} - -bool VPointer::try_AddI_no_overflow(int offset1, int offset2, int& result) { - jlong long_offset = java_add((jlong)(offset1), (jlong)(offset2)); - jint int_offset = java_add( offset1, offset2); - if (long_offset != int_offset) { - return false; - } - result = int_offset; - return true; -} - -bool VPointer::try_SubI_no_overflow(int offset1, int offset2, int& result) { - jlong long_offset = java_subtract((jlong)(offset1), (jlong)(offset2)); - jint int_offset = java_subtract( offset1, offset2); - if (long_offset != int_offset) { - return false; - } - result = int_offset; - return true; -} - -bool VPointer::try_AddSubI_no_overflow(int offset1, int offset2, bool is_sub, int& result) { - if (is_sub) { - return try_SubI_no_overflow(offset1, offset2, result); - } else { - return try_AddI_no_overflow(offset1, offset2, result); - } -} - -bool VPointer::try_LShiftI_no_overflow(int offset, int shift, int& result) { - if (shift < 0 || shift > 31) { - return false; - } - jlong long_offset = java_shift_left((jlong)(offset), shift); - jint int_offset = java_shift_left( offset, shift); - if (long_offset != int_offset) { - return false; - } - result = int_offset; - return true; -} - -bool VPointer::try_MulI_no_overflow(int offset1, int offset2, int& result) { - jlong long_offset = java_multiply((jlong)(offset1), (jlong)(offset2)); - jint int_offset = java_multiply( offset1, offset2); - if (long_offset != int_offset) { - return false; - } - result = int_offset; - return true; -} - // We use 
two comparisons, because a subtraction could underflow. #define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \ if (a < b) { return -1; } \ @@ -1380,285 +493,6 @@ void VPointer::print() const { } #endif -// Following are functions for tracing VPointer match -#ifndef PRODUCT -void VPointer::Tracer::print_depth() const { - for (int ii = 0; ii < _depth; ++ii) { - tty->print(" "); - } -} - -void VPointer::Tracer::ctor_1(const Node* mem) { - if (_is_trace_alignment) { - print_depth(); tty->print(" %d VPointer::VPointer: start alignment analysis", mem->_idx); mem->dump(); - } -} - -void VPointer::Tracer::ctor_2(Node* adr) { - if (_is_trace_alignment) { - //store_depth(); - inc_depth(); - print_depth(); tty->print(" %d (adr) VPointer::VPointer: ", adr->_idx); adr->dump(); - inc_depth(); - print_depth(); tty->print(" %d (base) VPointer::VPointer: ", adr->in(AddPNode::Base)->_idx); adr->in(AddPNode::Base)->dump(); - } -} - -void VPointer::Tracer::ctor_3(Node* adr, int i) { - if (_is_trace_alignment) { - inc_depth(); - Node* offset = adr->in(AddPNode::Offset); - print_depth(); tty->print(" %d (offset) VPointer::VPointer: i = %d: ", offset->_idx, i); offset->dump(); - } -} - -void VPointer::Tracer::ctor_4(Node* adr, int i) { - if (_is_trace_alignment) { - inc_depth(); - print_depth(); tty->print(" %d (adr) VPointer::VPointer: i = %d: ", adr->_idx, i); adr->dump(); - } -} - -void VPointer::Tracer::ctor_5(Node* adr, Node* base, int i) { - if (_is_trace_alignment) { - inc_depth(); - if (base == adr) { - print_depth(); tty->print_cr(" \\ %d (adr) == %d (base) VPointer::VPointer: breaking analysis at i = %d", adr->_idx, base->_idx, i); - } else if (!adr->is_AddP()) { - print_depth(); tty->print_cr(" \\ %d (adr) is NOT Addp VPointer::VPointer: breaking analysis at i = %d", adr->_idx, i); - } - } -} - -void VPointer::Tracer::ctor_6(const Node* mem) { - if (_is_trace_alignment) { - //restore_depth(); - print_depth(); tty->print_cr(" %d (adr) VPointer::VPointer: stop analysis", mem->_idx); - } 
-} - -void VPointer::Tracer::scaled_iv_plus_offset_1(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print(" %d VPointer::scaled_iv_plus_offset testing node: ", n->_idx); - n->dump(); - } -} - -void VPointer::Tracer::scaled_iv_plus_offset_2(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: PASSED", n->_idx); - } -} - -void VPointer::Tracer::scaled_iv_plus_offset_3(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: PASSED", n->_idx); - } -} - -void VPointer::Tracer::scaled_iv_plus_offset_4(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump(); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump(); - } -} - -void VPointer::Tracer::scaled_iv_plus_offset_5(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_AddI PASSED", n->_idx); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump(); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump(); - } -} - -void VPointer::Tracer::scaled_iv_plus_offset_6(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_%s PASSED", n->_idx, n->Name()); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(1) is scaled_iv: ", n->in(1)->_idx); n->in(1)->dump(); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(2) is offset_plus_k: ", n->in(2)->_idx); n->in(2)->dump(); - } -} - -void VPointer::Tracer::scaled_iv_plus_offset_7(Node* n) { - 
if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: Op_%s PASSED", n->_idx, n->Name()); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(2) is scaled_iv: ", n->in(2)->_idx); n->in(2)->dump(); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv_plus_offset: in(1) is offset_plus_k: ", n->in(1)->_idx); n->in(1)->dump(); - } -} - -void VPointer::Tracer::scaled_iv_plus_offset_8(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv_plus_offset: FAILED", n->_idx); - } -} - -void VPointer::Tracer::scaled_iv_1(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print(" %d VPointer::scaled_iv: testing node: ", n->_idx); n->dump(); - } -} - -void VPointer::Tracer::scaled_iv_2(Node* n, int scale) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv: FAILED since another _scale has been detected before", n->_idx); - print_depth(); tty->print_cr(" \\ VPointer::scaled_iv: _scale (%d) != 0", scale); - } -} - -void VPointer::Tracer::scaled_iv_3(Node* n, int scale) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv: is iv, setting _scale = %d", n->_idx, scale); - } -} - -void VPointer::Tracer::scaled_iv_4(Node* n, int scale) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump(); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump(); - } -} - -void VPointer::Tracer::scaled_iv_5(Node* n, int scale) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_MulI PASSED, setting _scale = %d", n->_idx, scale); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(2) is iv: ", n->in(2)->_idx); n->in(2)->dump(); - 
print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump(); - } -} - -void VPointer::Tracer::scaled_iv_6(Node* n, int scale) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_LShiftI PASSED, setting _scale = %d", n->_idx, scale); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(1) is iv: ", n->in(1)->_idx); n->in(1)->dump(); - print_depth(); tty->print(" \\ %d VPointer::scaled_iv: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump(); - } -} - -void VPointer::Tracer::scaled_iv_7(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_ConvI2L PASSED", n->_idx); - print_depth(); tty->print_cr(" \\ VPointer::scaled_iv: in(1) %d is scaled_iv_plus_offset: ", n->in(1)->_idx); - inc_depth(); inc_depth(); - print_depth(); n->in(1)->dump(); - dec_depth(); dec_depth(); - } -} - -void VPointer::Tracer::scaled_iv_8(Node* n, VPointer* tmp) { - if (_is_trace_alignment) { - print_depth(); tty->print(" %d VPointer::scaled_iv: Op_LShiftL, creating tmp VPointer: ", n->_idx); tmp->print(); - } -} - -void VPointer::Tracer::scaled_iv_9(Node* n, int scale, int offset, Node* invar) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::scaled_iv: Op_LShiftL PASSED, setting _scale = %d, _offset = %d", n->_idx, scale, offset); - print_depth(); tty->print_cr(" \\ VPointer::scaled_iv: in(1) [%d] is scaled_iv_plus_offset, in(2) [%d] used to scale: _scale = %d, _offset = %d", - n->in(1)->_idx, n->in(2)->_idx, scale, offset); - if (invar != nullptr) { - print_depth(); tty->print_cr(" \\ VPointer::scaled_iv: scaled invariant: [%d]", invar->_idx); - } - inc_depth(); inc_depth(); - print_depth(); n->in(1)->dump(); - print_depth(); n->in(2)->dump(); - if (invar != nullptr) { - print_depth(); invar->dump(); - } - dec_depth(); dec_depth(); - } -} - -void VPointer::Tracer::scaled_iv_10(Node* n) { - if (_is_trace_alignment) { - print_depth(); 
tty->print_cr(" %d VPointer::scaled_iv: FAILED", n->_idx); - } -} - -void VPointer::Tracer::offset_plus_k_1(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print(" %d VPointer::offset_plus_k: testing node: ", n->_idx); n->dump(); - } -} - -void VPointer::Tracer::offset_plus_k_2(Node* n, int _offset) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_ConI PASSED, setting _offset = %d", n->_idx, _offset); - } -} - -void VPointer::Tracer::offset_plus_k_3(Node* n, int _offset) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_ConL PASSED, setting _offset = %d", n->_idx, _offset); - } -} - -void VPointer::Tracer::offset_plus_k_4(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: FAILED", n->_idx); - print_depth(); tty->print_cr(" \\ " JLONG_FORMAT " VPointer::offset_plus_k: Op_ConL FAILED, k is too big", n->get_long()); - } -} - -void VPointer::Tracer::offset_plus_k_5(Node* n, Node* _invar) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: FAILED since another invariant has been detected before", n->_idx); - print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: _invar is not null: ", _invar->_idx); _invar->dump(); - } -} - -void VPointer::Tracer::offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_AddI PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d", - n->_idx, _negate_invar, _invar->_idx, _offset); - print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump(); - print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump(); - } -} - -void VPointer::Tracer::offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset) { - if 
(_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_AddI PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d", - n->_idx, _negate_invar, _invar->_idx, _offset); - print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump(); - print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump(); - } -} - -void VPointer::Tracer::offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_SubI is PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d", - n->_idx, _negate_invar, _invar->_idx, _offset); - print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(2) is Con: ", n->in(2)->_idx); n->in(2)->dump(); - print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(1) is invariant: ", _invar->_idx); _invar->dump(); - } -} - -void VPointer::Tracer::offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: Op_SubI PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset); - print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(1) is Con: ", n->in(1)->_idx); n->in(1)->dump(); - print_depth(); tty->print(" \\ %d VPointer::offset_plus_k: in(2) is invariant: ", _invar->_idx); _invar->dump(); - } -} - -void VPointer::Tracer::offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: PASSED, setting _debug_negate_invar = %d, _invar = %d, _offset = %d", n->_idx, _negate_invar, _invar->_idx, _offset); - print_depth(); tty->print_cr(" \\ %d VPointer::offset_plus_k: is invariant", n->_idx); - } -} - -void 
VPointer::Tracer::offset_plus_k_11(Node* n) { - if (_is_trace_alignment) { - print_depth(); tty->print_cr(" %d VPointer::offset_plus_k: FAILED", n->_idx); - } -} -#endif - - AlignmentSolution* AlignmentSolver::solve() const { DEBUG_ONLY( trace_start_solve(); ) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index b084edd44b339..a92959d418277 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -28,6 +28,7 @@ #include "opto/matcher.hpp" #include "opto/loopnode.hpp" #include "opto/traceAutoVectorizationTag.hpp" +#include "opto/mempointer.hpp" #include "utilities/pair.hpp" // Code in this file and the vectorization.cpp contains shared logics and @@ -667,45 +668,41 @@ class VLoopAnalyzer : StackObj { VStatus setup_submodules_helper(); }; -// A vectorization pointer (VPointer) has information about an address for -// dependence checking and vector alignment. It's usually bound to a memory -// operation in a counted loop for vectorizable analysis. -// -// We parse and represent pointers of the simple form: -// -// pointer = adr + offset + invar + scale * ConvI2L(iv) -// -// Where: -// -// adr: the base address of an array (base = adr) -// OR -// some address to off-heap memory (base = TOP) -// -// offset: a constant offset -// invar: a runtime variable, which is invariant during the loop -// scale: scaling factor -// iv: loop induction variable -// -// But more precisely, we parse the composite-long-int form: -// -// pointer = adr + long_offset + long_invar + long_scale * ConvI2L(int_offset + inv_invar + int_scale * iv) -// -// pointer = adr + long_offset + long_invar + long_scale * ConvI2L(int_index) -// int_index = int_offset + int_invar + int_scale * iv -// -// However, for aliasing and adjacency checks (e.g. VPointer::cmp()) we always use the simple form to make -// decisions. 
Hence, we must make sure to only create a "valid" VPointer if the optimisations based on the -// simple form produce the same result as the compound-long-int form would. Intuitively, this depends on -// if the int_index overflows, but the precise conditions are given in VPointer::is_safe_to_use_as_simple_form(). -// -// ConvI2L(int_index) = ConvI2L(int_offset + int_invar + int_scale * iv) -// = Convi2L(int_offset) + ConvI2L(int_invar) + ConvI2L(int_scale) * ConvI2L(iv) -// -// scale = long_scale * ConvI2L(int_scale) -// offset = long_offset + long_scale * ConvI2L(int_offset) -// invar = long_invar + long_scale * ConvI2L(int_invar) +// TODO +// vpointer = base + con + invar + iv_scale * iv +class XPointer : public ArenaObj { +private: + bool _is_valid; + + Node* const _base; + jint _con; + Node* const _invar; + jint _invar_alignment; + jint _iv_scale; + + jint _size; + +public: + // Default constructor, e.g. for GrowableArray. + XPointer() : + _is_valid(false), + _base(nullptr), + _con(0), + _invar(0), + _invar_alignment(0), + _iv_scale(0), + _size(0) {} + + bool is_valid() const { return _is_valid; } + + + // TODO + // , _tracer(vloop.is_trace_pointer_analysis()) +}; + +// TODO rm +// vpointer = base + con + invar + scale * iv // -// pointer = adr + offset + invar + scale * ConvI2L(iv) // class VPointer : public ArenaObj { protected: @@ -725,13 +722,6 @@ class VPointer : public ArenaObj { Node* _debug_invar_scale; // multiplier for invariant #endif - // The int_index components of the compound-long-int form. Used to decide if it is safe to use the - // simple form rather than the compound-long-int form that was parsed. 
- bool _has_int_index_after_convI2L; - int _int_index_after_convI2L_offset; - Node* _int_index_after_convI2L_invar; - int _int_index_after_convI2L_scale; - Node_Stack* _nstack; // stack used to record a vpointer trace of variants bool _analyze_only; // Used in loop unrolling only for vpointer trace uint _stack_idx; // Used in loop unrolling only for vpointer trace @@ -740,16 +730,6 @@ class VPointer : public ArenaObj { IdealLoopTree* lpt() const { return _vloop.lpt(); } PhiNode* iv() const { return _vloop.iv(); } - bool is_loop_member(Node* n) const; - bool invariant(Node* n) const; - - // Match: k*iv + offset - bool scaled_iv_plus_offset(Node* n); - // Match: k*iv where k is a constant that's not zero - bool scaled_iv(Node* n); - // Match: offset is (k [+/- invariant]) - bool offset_plus_k(Node* n, bool negate = false); - public: enum CMP { Less = 1, @@ -766,13 +746,8 @@ class VPointer : public ArenaObj { private: VPointer(MemNode* const mem, const VLoop& vloop, Node_Stack* nstack, bool analyze_only); - // Following is used to create a temporary object during - // the pattern match of an address expression. 
- VPointer(VPointer* p); NONCOPYABLE(VPointer); - bool is_safe_to_use_as_simple_form(Node* base, Node* adr) const; - public: bool valid() const { return _adr != nullptr; } bool has_iv() const { return _scale != 0; } @@ -870,82 +845,6 @@ class VPointer : public ArenaObj { static int cmp_for_sort(const VPointer** p1, const VPointer** p2); NOT_PRODUCT( void print() const; ) - -#ifndef PRODUCT - class Tracer { - friend class VPointer; - bool _is_trace_alignment; - static int _depth; - int _depth_save; - void print_depth() const; - int depth() const { return _depth; } - void set_depth(int d) { _depth = d; } - void inc_depth() { _depth++; } - void dec_depth() { if (_depth > 0) _depth--; } - void store_depth() { _depth_save = _depth; } - void restore_depth() { _depth = _depth_save; } - - class Depth { - friend class VPointer; - Depth() { ++_depth; } - Depth(int x) { _depth = 0; } - ~Depth() { if (_depth > 0) --_depth; } - }; - Tracer(bool is_trace_alignment) : _is_trace_alignment(is_trace_alignment) {} - - // tracing functions - void ctor_1(const Node* mem); - void ctor_2(Node* adr); - void ctor_3(Node* adr, int i); - void ctor_4(Node* adr, int i); - void ctor_5(Node* adr, Node* base, int i); - void ctor_6(const Node* mem); - - void scaled_iv_plus_offset_1(Node* n); - void scaled_iv_plus_offset_2(Node* n); - void scaled_iv_plus_offset_3(Node* n); - void scaled_iv_plus_offset_4(Node* n); - void scaled_iv_plus_offset_5(Node* n); - void scaled_iv_plus_offset_6(Node* n); - void scaled_iv_plus_offset_7(Node* n); - void scaled_iv_plus_offset_8(Node* n); - - void scaled_iv_1(Node* n); - void scaled_iv_2(Node* n, int scale); - void scaled_iv_3(Node* n, int scale); - void scaled_iv_4(Node* n, int scale); - void scaled_iv_5(Node* n, int scale); - void scaled_iv_6(Node* n, int scale); - void scaled_iv_7(Node* n); - void scaled_iv_8(Node* n, VPointer* tmp); - void scaled_iv_9(Node* n, int _scale, int _offset, Node* _invar); - void scaled_iv_10(Node* n); - - void offset_plus_k_1(Node* 
n); - void offset_plus_k_2(Node* n, int _offset); - void offset_plus_k_3(Node* n, int _offset); - void offset_plus_k_4(Node* n); - void offset_plus_k_5(Node* n, Node* _invar); - void offset_plus_k_6(Node* n, Node* _invar, bool _negate_invar, int _offset); - void offset_plus_k_7(Node* n, Node* _invar, bool _negate_invar, int _offset); - void offset_plus_k_8(Node* n, Node* _invar, bool _negate_invar, int _offset); - void offset_plus_k_9(Node* n, Node* _invar, bool _negate_invar, int _offset); - void offset_plus_k_10(Node* n, Node* _invar, bool _negate_invar, int _offset); - void offset_plus_k_11(Node* n); - } _tracer; // Tracer -#endif - - Node* maybe_negate_invar(bool negate, Node* invar); - - void maybe_add_to_invar(Node* new_invar, bool negate); - - static bool try_AddI_no_overflow(int offset1, int offset2, int& result); - static bool try_SubI_no_overflow(int offset1, int offset2, int& result); - static bool try_AddSubI_no_overflow(int offset1, int offset2, bool is_sub, int& result); - static bool try_LShiftI_no_overflow(int offset1, int offset2, int& result); - static bool try_MulI_no_overflow(int offset1, int offset2, int& result); - - Node* register_if_new(Node* n) const; }; From d69fc7f0f6cb99cba708c8ad0a359a5aa5d6c422 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 6 Nov 2024 16:29:25 +0100 Subject: [PATCH 002/130] first steps using MemPointerDecomposedForm --- src/hotspot/share/opto/mempointer.hpp | 28 ++++++----- src/hotspot/share/opto/superword.cpp | 3 ++ src/hotspot/share/opto/vectorization.cpp | 13 +++++ src/hotspot/share/opto/vectorization.hpp | 60 +++++++++++++++++------- 4 files changed, 74 insertions(+), 30 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 1e5b2c00b8822..6f60835bdfcc1 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -461,9 +461,8 @@ class MemPointerSummand : public StackObj { #ifndef PRODUCT void print_on(outputStream* 
st) const { - st->print("Summand["); _scale.print_on(st); - tty->print(" * [%d %s]]", _variable->_idx, _variable->Name()); + tty->print(" * [%d %s]", _variable->_idx, _variable->Name()); } #endif }; @@ -473,13 +472,14 @@ class MemPointerSummand : public StackObj { // pointer = SUM(summands) + con // class MemPointerDecomposedForm : public StackObj { -private: +public: // We limit the number of summands to 10. This is just a best guess, and not at this // point supported by evidence. But I think it is reasonable: usually, a pointer // contains a base pointer (e.g. array pointer or null for native memory) and a few // variables. It should be rare that we have more than 9 variables. static const int SUMMANDS_SIZE = 10; +private: Node* _pointer; // pointer node associated with this (sub)pointer MemPointerSummand _summands[SUMMANDS_SIZE]; @@ -524,7 +524,7 @@ class MemPointerDecomposedForm : public StackObj { MemPointerAliasing get_aliasing_with(const MemPointerDecomposedForm& other NOT_PRODUCT( COMMA const TraceMemPointer& trace) ) const; - const MemPointerSummand summands_at(const uint i) const { + const MemPointerSummand& summands_at(const uint i) const { assert(i < SUMMANDS_SIZE, "in bounds"); return _summands[i]; } @@ -532,20 +532,24 @@ class MemPointerDecomposedForm : public StackObj { const NoOverflowInt con() const { return _con; } #ifndef PRODUCT - void print_on(outputStream* st) const { - if (_pointer == nullptr) { - st->print_cr("MemPointerDecomposedForm empty."); - return; - } - st->print("MemPointerDecomposedForm[%d %s: con = ", _pointer->_idx, _pointer->Name()); + void print_form_on(outputStream* st) const { _con.print_on(st); for (int i = 0; i < SUMMANDS_SIZE; i++) { const MemPointerSummand& summand = _summands[i]; if (summand.variable() != nullptr) { - st->print(", "); + st->print(" + "); summand.print_on(st); } } + } + + void print_on(outputStream* st) const { + if (_pointer == nullptr) { + st->print_cr("MemPointerDecomposedForm empty."); + return; + } + 
st->print("MemPointerDecomposedForm[%d %s: form = ", _pointer->_idx, _pointer->Name()); + print_form_on(st); st->print_cr("]"); } #endif @@ -568,7 +572,7 @@ class MemPointerDecomposedFormParser : public StackObj { _decomposed_form = parse_decomposed_form(); } - const MemPointerDecomposedForm decomposed_form() const { return _decomposed_form; } + const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } private: MemPointerDecomposedForm parse_decomposed_form(); diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 20c8dfbff1776..2a85b544b7f57 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -141,6 +141,9 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa // Mark the components of the memory operation in nstack VPointer p1(current, vloop, &nstack); have_side_effects = p1.node_stack()->is_nonempty(); + + XPointer xp(current, vloop); + NOT_PRODUCT( xp.print_on(tty); ) } // Process the pointer stack diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 9259f6bbe29f1..aec4651727372 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -469,6 +469,19 @@ int VPointer::cmp_for_sort(const VPointer** p1, const VPointer** p2) { } #ifndef PRODUCT +void XPointer::print_on(outputStream* st) const { + st->print("XPointer["); + + if (!is_valid()) { + st->print_cr("invalid]"); + return; + } + + st->print("size = %2d, form = ", _size); + _decomposed_form.print_form_on(st); + st->print_cr("]"); +} + // Function for printing the fields of a VPointer void VPointer::print() const { tty->print("VPointer[mem: %4d %10s, ", _mem->_idx, _mem->Name()); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index a92959d418277..0a85d44cba544 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ 
b/src/hotspot/share/opto/vectorization.hpp @@ -669,35 +669,59 @@ class VLoopAnalyzer : StackObj { }; // TODO -// vpointer = base + con + invar + iv_scale * iv class XPointer : public ArenaObj { private: - bool _is_valid; - - Node* const _base; - jint _con; - Node* const _invar; - jint _invar_alignment; - jint _iv_scale; - - jint _size; + const MemPointerDecomposedForm _decomposed_form; + const jint _size; + const bool _is_valid; public: // Default constructor, e.g. for GrowableArray. XPointer() : - _is_valid(false), - _base(nullptr), - _con(0), - _invar(0), - _invar_alignment(0), - _iv_scale(0), - _size(0) {} + _decomposed_form(), + _size(0), + _is_valid(false) {} + + XPointer(const MemNode* mem, const VLoop& vloop) : + _decomposed_form(init_decomposed_form(mem)), + _size(mem->memory_size()), + _is_valid(init_is_valid(_decomposed_form, vloop)) {} + // Accessors bool is_valid() const { return _is_valid; } + // TODO + // if (vloop.is_trace_pointer_analysis()) { + NOT_PRODUCT( void print_on(outputStream* st) const; ) + +private: + static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem) { + assert(mem->is_Store() || mem->is_Load(), "only stores and loads are supported"); + ResourceMark rm; + MemPointerDecomposedFormParser parser(mem); + return parser.decomposed_form(); + } + + // Check that all variables are either the iv, or else invariants. + // TODO why invariant? 
+ static bool init_is_valid(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { + for (uint i = 0; i < MemPointerDecomposedForm::SUMMANDS_SIZE; i++) { + const MemPointerSummand& summand = decomposed_form.summands_at(i); + Node* variable = summand.variable(); + if (variable != nullptr && variable != vloop.iv() && !is_invariant(variable, vloop)) { + return false; + } + } + return true; + } // TODO - // , _tracer(vloop.is_trace_pointer_analysis()) + static bool is_invariant(Node* n, const VLoop& vloop) { + assert(vloop.cl()->is_main_loop(), "must be"); + Node* ctrl = vloop.phase()->get_ctrl(n); + // TODO + return true; + } }; // TODO rm From f1ab88d94bb8b4053b89df4e6acd1c70c8a38ed2 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 6 Nov 2024 17:30:37 +0100 Subject: [PATCH 003/130] work on invariant and stub of adr_node_callback --- src/hotspot/share/opto/superword.cpp | 5 ++++- src/hotspot/share/opto/vectorization.hpp | 25 +++++++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 2a85b544b7f57..3eaa1040391de 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -142,7 +142,10 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa VPointer p1(current, vloop, &nstack); have_side_effects = p1.node_stack()->is_nonempty(); - XPointer xp(current, vloop); + XPointer xp(current, vloop, [&] (Node* n) { + NOT_PRODUCT( n->dump(); ) + assert(false, "TODO"); + }); NOT_PRODUCT( xp.print_on(tty); ) } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 0a85d44cba544..01e3578c14c67 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -682,8 +682,9 @@ class XPointer : public ArenaObj { _size(0), _is_valid(false) {} - XPointer(const MemNode* mem, const VLoop& vloop) : - 
_decomposed_form(init_decomposed_form(mem)), + template + XPointer(const MemNode* mem, const VLoop& vloop, Callback adr_node_callback) : + _decomposed_form(init_decomposed_form(mem, adr_node_callback)), _size(mem->memory_size()), _is_valid(init_is_valid(_decomposed_form, vloop)) {} @@ -695,7 +696,8 @@ class XPointer : public ArenaObj { NOT_PRODUCT( void print_on(outputStream* st) const; ) private: - static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem) { + template + static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem, Callback adr_node_callback) { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are supported"); ResourceMark rm; MemPointerDecomposedFormParser parser(mem); @@ -703,7 +705,7 @@ class XPointer : public ArenaObj { } // Check that all variables are either the iv, or else invariants. - // TODO why invariant? + // TODO why pre-loop static bool init_is_valid(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { for (uint i = 0; i < MemPointerDecomposedForm::SUMMANDS_SIZE; i++) { const MemPointerSummand& summand = decomposed_form.summands_at(i); @@ -715,12 +717,21 @@ class XPointer : public ArenaObj { return true; } - // TODO + // TODO refactor to VLoop? + // Is it invariant of the loop, i.e. the main-loop and even the pre-loop? + // The invariants are used for alignment, in the exit check of the pre-loop, + // this is why we need invariance of even the pre-loop. static bool is_invariant(Node* n, const VLoop& vloop) { assert(vloop.cl()->is_main_loop(), "must be"); Node* ctrl = vloop.phase()->get_ctrl(n); - // TODO - return true; + + // Quick test: is it in the main-loop? + if (vloop.lpt()->is_member(vloop.phase()->get_loop(ctrl))) { + return false; + } + + // Is it before the pre-loop? 
+ return vloop.phase()->is_dominator(ctrl, vloop.pre_loop_head()); } }; From 56e43c23ada88339ee604caabe8c0383cd105413 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 7 Nov 2024 09:23:28 +0100 Subject: [PATCH 004/130] Callback refactor with object --- src/hotspot/share/opto/mempointer.cpp | 10 +++++++--- src/hotspot/share/opto/mempointer.hpp | 23 +++++++++++++++++++---- src/hotspot/share/opto/superword.cpp | 13 +++++++++---- src/hotspot/share/opto/vectorization.hpp | 9 +++++---- 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index df443c69449cb..764a2e144af80 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -28,7 +28,7 @@ // Recursively parse the pointer expression with a DFS all-path traversal // (i.e. with node repetitions), starting at the pointer. -MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form() { +MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form(Callback& adr_node_callback) { assert(_worklist.is_empty(), "no prior parsing"); assert(_summands.is_empty(), "no prior parsing"); @@ -43,7 +43,7 @@ MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form() while (_worklist.is_nonempty()) { // Bail out if the graph is too complex. if (traversal_count++ > 1000) { return MemPointerDecomposedForm::make_trivial(pointer); } - parse_sub_expression(_worklist.pop()); + parse_sub_expression(_worklist.pop(), adr_node_callback); } // Bail out if there is a constant overflow. @@ -82,7 +82,7 @@ MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form() // Parse a sub-expression of the pointer, starting at the current summand. We parse the // current node, and see if it can be decomposed into further summands, or if the current // summand is terminal. 
-void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSummand& summand) { +void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSummand& summand, Callback& adr_node_callback) { Node* n = summand.variable(); const NoOverflowInt scale = summand.scale(); const NoOverflowInt one(1); @@ -108,6 +108,7 @@ void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSumman Node* b = n->in((opc == Op_AddP) ? 3 : 2); _worklist.push(MemPointerSummand(a, scale)); _worklist.push(MemPointerSummand(b, scale)); + adr_node_callback.callback(n); return; } case Op_SubL: @@ -121,6 +122,7 @@ void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSumman _worklist.push(MemPointerSummand(a, scale)); _worklist.push(MemPointerSummand(b, sub_scale)); + adr_node_callback.callback(n); return; } case Op_MulL: @@ -155,6 +157,7 @@ void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSumman NoOverflowInt new_scale = scale * factor; _worklist.push(MemPointerSummand(variable, new_scale)); + adr_node_callback.callback(n); return; } case Op_CastII: @@ -173,6 +176,7 @@ void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSumman // Decompose: look through. 
Node* a = n->in(1); _worklist.push(MemPointerSummand(a, scale)); + adr_node_callback.callback(n); return; } default: diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 6f60835bdfcc1..048abc1df76e0 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -568,15 +568,30 @@ class MemPointerDecomposedFormParser : public StackObj { MemPointerDecomposedForm _decomposed_form; public: - MemPointerDecomposedFormParser(const MemNode* mem) : _mem(mem), _con(NoOverflowInt(0)) { - _decomposed_form = parse_decomposed_form(); + class Callback : public StackObj { + public: + virtual void callback(Node* n) { /* do nothing by default */ } + }; + + MemPointerDecomposedFormParser(const MemNode* mem) : + _mem(mem), _con(NoOverflowInt(0)) + { + Callback empty_callback; + _decomposed_form = parse_decomposed_form(empty_callback); + } + + MemPointerDecomposedFormParser(const MemNode* mem, Callback& adr_node_callback) : + _mem(mem), _con(NoOverflowInt(0)) + { + _decomposed_form = parse_decomposed_form(adr_node_callback); } const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } private: - MemPointerDecomposedForm parse_decomposed_form(); - void parse_sub_expression(const MemPointerSummand& summand); + MemPointerDecomposedForm parse_decomposed_form(Callback& adr_node_callback); + + void parse_sub_expression(const MemPointerSummand& summand, Callback& adr_node_callback); bool is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const; }; diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 3eaa1040391de..ead25a542fd81 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -48,12 +48,20 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : { } +class SuperWordUnrollingAnalysisIgnoredNodes : public MemPointerDecomposedFormParser::Callback { +private: + // TODO +public: + 
virtual void callback(Node* n) override { assert(false, "TODO"); } +}; + void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor) { IdealLoopTree* lpt = vloop.lpt(); CountedLoopNode* cl = vloop.cl(); Node* cl_exit = vloop.cl_exit(); PhaseIdealLoop* phase = vloop.phase(); + SuperWordUnrollingAnalysisIgnoredNodes ignored_nodes; bool is_slp = true; size_t ignored_size = lpt->_body.size(); int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size); @@ -142,10 +150,7 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa VPointer p1(current, vloop, &nstack); have_side_effects = p1.node_stack()->is_nonempty(); - XPointer xp(current, vloop, [&] (Node* n) { - NOT_PRODUCT( n->dump(); ) - assert(false, "TODO"); - }); + XPointer xp(current, vloop, ignored_nodes); NOT_PRODUCT( xp.print_on(tty); ) } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 01e3578c14c67..deb365b68dae1 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -671,6 +671,8 @@ class VLoopAnalyzer : StackObj { // TODO class XPointer : public ArenaObj { private: + typedef MemPointerDecomposedFormParser::Callback Callback; + const MemPointerDecomposedForm _decomposed_form; const jint _size; const bool _is_valid; @@ -683,7 +685,7 @@ class XPointer : public ArenaObj { _is_valid(false) {} template - XPointer(const MemNode* mem, const VLoop& vloop, Callback adr_node_callback) : + XPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : _decomposed_form(init_decomposed_form(mem, adr_node_callback)), _size(mem->memory_size()), _is_valid(init_is_valid(_decomposed_form, vloop)) {} @@ -696,11 +698,10 @@ class XPointer : public ArenaObj { NOT_PRODUCT( void print_on(outputStream* st) const; ) private: - template - static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem, Callback adr_node_callback) { + static const 
MemPointerDecomposedForm init_decomposed_form(const MemNode* mem, Callback& adr_node_callback) { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are supported"); ResourceMark rm; - MemPointerDecomposedFormParser parser(mem); + MemPointerDecomposedFormParser parser(mem, adr_node_callback); return parser.decomposed_form(); } From 7cdce79150cfc25afaaf5d72d5d175f20bcfb170 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 7 Nov 2024 10:08:39 +0100 Subject: [PATCH 005/130] refactor ignored nodes --- src/hotspot/share/opto/superword.cpp | 91 ++++++++++++++++------------ 1 file changed, 53 insertions(+), 38 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index ead25a542fd81..e86953b34d378 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -48,11 +48,50 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : { } +// Collect ignored loop nodes during XPointer parsing. class SuperWordUnrollingAnalysisIgnoredNodes : public MemPointerDecomposedFormParser::Callback { private: - // TODO + const VLoop& _vloop; + const Node_List& _body; + bool* _ignored; + public: - virtual void callback(Node* n) override { assert(false, "TODO"); } + SuperWordUnrollingAnalysisIgnoredNodes(const VLoop& vloop) : + _vloop(vloop), + _body(_vloop.lpt()->_body), + _ignored(NEW_RESOURCE_ARRAY(bool, _body.size())) + { + for (uint i = 0; i < _body.size(); i++) { + _ignored[i] = false; + } + } + + virtual void callback(Node* n) override { set_ignored(n); } + + void set_ignored(uint i) { + assert(i < _body.size(), "must be in bounds"); + _ignored[i] = true; + } + + void set_ignored(Node* n) { + // Only consider nodes in the loop. + Node* ctrl = _vloop.phase()->get_ctrl(n); + if (_vloop.lpt()->is_member(_vloop.phase()->get_loop(ctrl))) { + // Find the index in the loop. 
+ for (uint j = 0; j < _body.size(); j++) { + if (n == _body.at(j)) { + set_ignored(j); + return; + } + } + assert(false, "must find"); + } + } + + bool is_ignored(uint i) const { + assert(i < _vloop.lpt()->_body.size(), "must be in bounds"); + return _ignored[i]; + } }; void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor) { @@ -61,16 +100,8 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa Node* cl_exit = vloop.cl_exit(); PhaseIdealLoop* phase = vloop.phase(); - SuperWordUnrollingAnalysisIgnoredNodes ignored_nodes; + SuperWordUnrollingAnalysisIgnoredNodes ignored_nodes(vloop); bool is_slp = true; - size_t ignored_size = lpt->_body.size(); - int *ignored_loop_nodes = NEW_RESOURCE_ARRAY(int, ignored_size); - Node_Stack nstack((int)ignored_size); - - // First clear the entries - for (uint i = 0; i < lpt->_body.size(); i++) { - ignored_loop_nodes[i] = -1; - } int max_vector = Matcher::max_vector_size_auto_vectorization(T_BYTE); @@ -85,7 +116,7 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa n->is_IfTrue() || n->is_CountedLoop() || (n == cl_exit)) { - ignored_loop_nodes[i] = n->_idx; + ignored_nodes.set_ignored(i); continue; } @@ -93,7 +124,7 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa IfNode *iff = n->as_If(); if (iff->_fcnt != COUNT_UNKNOWN && iff->_prob != PROB_UNKNOWN) { if (lpt->is_loop_exit(iff)) { - ignored_loop_nodes[i] = n->_idx; + ignored_nodes.set_ignored(i); continue; } } @@ -111,7 +142,7 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa // This must happen after check of phi/if if (n->is_Phi() || n->is_If()) { - ignored_loop_nodes[i] = n->_idx; + ignored_nodes.set_ignored(i); continue; } @@ -129,7 +160,7 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa bt = n->bottom_type()->basic_type(); } if (is_java_primitive(bt) == false) { - 
ignored_loop_nodes[i] = n->_idx; + ignored_nodes.set_ignored(i); continue; } @@ -141,32 +172,16 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa // save a queue of post process nodes if (n_ctrl != nullptr && lpt->is_member(phase->get_loop(n_ctrl))) { // Process the memory expression - int stack_idx = 0; - bool have_side_effects = true; - if (adr->is_AddP() == false) { - nstack.push(adr, stack_idx++); + if (!adr->is_AddP()) { + n->dump(); + adr->dump(); + assert(false, "what is this?"); + ignored_nodes.set_ignored(adr); } else { - // Mark the components of the memory operation in nstack - VPointer p1(current, vloop, &nstack); - have_side_effects = p1.node_stack()->is_nonempty(); - + // Mark the internal nodes of the address expression in ignored_nodes. XPointer xp(current, vloop, ignored_nodes); NOT_PRODUCT( xp.print_on(tty); ) } - - // Process the pointer stack - while (have_side_effects) { - Node* pointer_node = nstack.node(); - for (uint j = 0; j < lpt->_body.size(); j++) { - Node* cur_node = lpt->_body.at(j); - if (cur_node == pointer_node) { - ignored_loop_nodes[j] = cur_node->_idx; - break; - } - } - nstack.pop(); - have_side_effects = nstack.is_nonempty(); - } } } } @@ -176,7 +191,7 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa // description can use bool flag_small_bt = false; for (uint i = 0; i < lpt->_body.size(); i++) { - if (ignored_loop_nodes[i] != -1) continue; + if (ignored_nodes.is_ignored(i)) continue; BasicType bt; Node* n = lpt->_body.at(i); From 51960c9d283087556fa0df9241d5cb162cb94504 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 7 Nov 2024 13:13:20 +0100 Subject: [PATCH 006/130] trace flag --- src/hotspot/share/opto/superword.cpp | 1 - src/hotspot/share/opto/vectorization.hpp | 12 +++++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index e86953b34d378..ea2f7d56424de 
100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -180,7 +180,6 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa } else { // Mark the internal nodes of the address expression in ignored_nodes. XPointer xp(current, vloop, ignored_nodes); - NOT_PRODUCT( xp.print_on(tty); ) } } } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index deb365b68dae1..53916d01adc6f 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -688,13 +688,19 @@ class XPointer : public ArenaObj { XPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : _decomposed_form(init_decomposed_form(mem, adr_node_callback)), _size(mem->memory_size()), - _is_valid(init_is_valid(_decomposed_form, vloop)) {} + _is_valid(init_is_valid(_decomposed_form, vloop)) + { +#ifndef PRODUCT + if (vloop.is_trace_pointer_analysis()) { + print_on(tty); + mem->dump(); + } +#endif + } // Accessors bool is_valid() const { return _is_valid; } - // TODO - // if (vloop.is_trace_pointer_analysis()) { NOT_PRODUCT( void print_on(outputStream* st) const; ) private: From 5fb21ae9a5e322b489d3f775256db7e265cd02d1 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 7 Nov 2024 13:34:39 +0100 Subject: [PATCH 007/130] fix dump for product --- src/hotspot/share/opto/superword.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index ea2f7d56424de..f804194c6df41 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -173,8 +173,8 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa if (n_ctrl != nullptr && lpt->is_member(phase->get_loop(n_ctrl))) { // Process the memory expression if (!adr->is_AddP()) { - n->dump(); - adr->dump(); + NOT_PRODUCT( n->dump(); ) + NOT_PRODUCT( 
adr->dump(); ) assert(false, "what is this?"); ignored_nodes.set_ignored(adr); } else { From b12ce13b9892469797dbf3b2982734b001e5303f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 7 Nov 2024 14:43:51 +0100 Subject: [PATCH 008/130] find base --- src/hotspot/share/opto/mempointer.cpp | 6 +++ src/hotspot/share/opto/mempointer.hpp | 58 +++++++++++++++++++----- src/hotspot/share/opto/vectorization.cpp | 4 +- src/hotspot/share/opto/vectorization.hpp | 6 +++ 4 files changed, 62 insertions(+), 12 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 764a2e144af80..0252b60543ef3 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -24,6 +24,7 @@ #include "precompiled.hpp" #include "opto/mempointer.hpp" +#include "opto/addnode.hpp" #include "utilities/resourceHash.hpp" // Recursively parse the pointer expression with a DFS all-path traversal @@ -302,6 +303,11 @@ bool MemPointerDecomposedFormParser::is_safe_to_decompose_op(const int opc, cons #endif } +MemPointerDecomposedForm::Base MemPointerDecomposedForm::Base::from_AddP(Node* pointer) { + AddPNode* adr = pointer->isa_AddP(); + return (adr == nullptr) ? Base() : Base(true, adr->in(AddPNode::Base)); +} + // Compute the aliasing between two MemPointerDecomposedForm. We use the "MemPointer Lemma" to // prove that the computed aliasing also applies for the underlying pointers. Note that the // condition (S0) is already given, because the MemPointerDecomposedForm is always constructed diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 048abc1df76e0..ea569e542f359 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -479,25 +479,58 @@ class MemPointerDecomposedForm : public StackObj { // variables. It should be rare that we have more than 9 variables. 
static const int SUMMANDS_SIZE = 10; -private: - Node* _pointer; // pointer node associated with this (sub)pointer + class Base : public StackObj { + private: + bool _is_known; + Node* _base; + + Base(bool is_known, Node* base) : _is_known(is_known), _base(base) {} + + public: + Base() : Base(false, nullptr) {} + + static Base from_AddP(Node* pointer); + bool is_known() const { return _is_known; } + Node* get() const { assert(is_known(), "must be"); return _base; } + +#ifndef PRODUCT + void print_on(outputStream* st) const { + if (_is_known) { + if (_base == nullptr) { + tty->print("native"); + } else { + tty->print("%d %s", _base->_idx, _base->Name()); + } + } else { + tty->print("unknown"); + } + } +#endif + }; +private: MemPointerSummand _summands[SUMMANDS_SIZE]; NoOverflowInt _con; + Base _base; public: // Empty - MemPointerDecomposedForm() : _pointer(nullptr), _con(NoOverflowInt::make_NaN()) {} + MemPointerDecomposedForm() : _con(NoOverflowInt::make_NaN()) {} private: // Default / trivial: pointer = 0 + 1 * pointer - MemPointerDecomposedForm(Node* pointer) : _pointer(pointer), _con(NoOverflowInt(0)) { + MemPointerDecomposedForm(Node* pointer) : + _con(NoOverflowInt(0)), + _base(Base::from_AddP(pointer)) + { assert(pointer != nullptr, "pointer must be non-null"); _summands[0] = MemPointerSummand(pointer, NoOverflowInt(1)); } - MemPointerDecomposedForm(Node* pointer, const GrowableArray& summands, const NoOverflowInt& con) - : _pointer(pointer), _con(con) { + MemPointerDecomposedForm(Node* pointer, const GrowableArray& summands, const NoOverflowInt& con) : + _con(con), + _base(Base::from_AddP(pointer)) + { assert(!_con.is_NaN(), "non-NaN constant"); assert(summands.length() <= SUMMANDS_SIZE, "summands must fit"); for (int i = 0; i < summands.length(); i++) { @@ -530,9 +563,14 @@ class MemPointerDecomposedForm : public StackObj { } const NoOverflowInt con() const { return _con; } + const Base& base() const { return _base; } #ifndef PRODUCT void 
print_form_on(outputStream* st) const { + if (_con.is_NaN()) { + st->print_cr("empty"); + return; + } _con.print_on(st); for (int i = 0; i < SUMMANDS_SIZE; i++) { const MemPointerSummand& summand = _summands[i]; @@ -544,11 +582,9 @@ class MemPointerDecomposedForm : public StackObj { } void print_on(outputStream* st) const { - if (_pointer == nullptr) { - st->print_cr("MemPointerDecomposedForm empty."); - return; - } - st->print("MemPointerDecomposedForm[%d %s: form = ", _pointer->_idx, _pointer->Name()); + st->print("MemPointerDecomposedForm[base: "); + _base.print_on(st); + st->print(", form: "); print_form_on(st); st->print_cr("]"); } diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index aec4651727372..cec7eefce9eb7 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -477,7 +477,9 @@ void XPointer::print_on(outputStream* st) const { return; } - st->print("size = %2d, form = ", _size); + st->print("size: %2d, base: ", _size); + _decomposed_form.base().print_on(st); + st->print(", form: "); _decomposed_form.print_form_on(st); st->print_cr("]"); } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 53916d01adc6f..13a45eeb71c95 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -714,6 +714,12 @@ class XPointer : public ArenaObj { // Check that all variables are either the iv, or else invariants. // TODO why pre-loop static bool init_is_valid(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { + if (!decomposed_form.base().is_known()) { + // XPointer needs to know if it is native (off-heap) or object (on-heap). 
+ assert(false, "TODO find me!"); + return false; + } + for (uint i = 0; i < MemPointerDecomposedForm::SUMMANDS_SIZE; i++) { const MemPointerSummand& summand = decomposed_form.summands_at(i); Node* variable = summand.variable(); From 155dbba357dd83b2a4c5171913f7bfa6f1281b1d Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 7 Nov 2024 15:04:54 +0100 Subject: [PATCH 009/130] xpointer pulled in to SuperWord --- src/hotspot/share/opto/vectorization.cpp | 17 +++++++++++++++++ src/hotspot/share/opto/vectorization.hpp | 4 ++++ 2 files changed, 21 insertions(+) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index cec7eefce9eb7..b1083de214d76 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -197,6 +197,8 @@ void VLoopVPointers::count_vpointers() { void VLoopVPointers::allocate_vpointers_array() { uint bytes = _vpointers_length * sizeof(VPointer); _vpointers = (VPointer*)_arena->Amalloc(bytes); + uint bytes2 = _vpointers_length * sizeof(XPointer); + _xpointers = (XPointer*)_arena->Amalloc(bytes2); } void VLoopVPointers::compute_and_cache_vpointers() { @@ -204,6 +206,9 @@ void VLoopVPointers::compute_and_cache_vpointers() { _body.for_each_mem([&] (MemNode* const mem, int bb_idx) { // Placement new: construct directly into the array. ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop); + + MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? 
+ ::new (&_xpointers[pointers_idx]) XPointer(mem, _vloop, empty_callback); _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); pointers_idx++; }); @@ -217,6 +222,14 @@ const VPointer& VLoopVPointers::vpointer(const MemNode* mem) const { return _vpointers[pointers_idx]; } +const XPointer& VLoopVPointers::xpointer(const MemNode* mem) const { + assert(mem != nullptr && _vloop.in_bb(mem), "only mem in loop"); + int bb_idx = _body.bb_idx(mem); + int pointers_idx = _bb_idx_to_vpointer.at(bb_idx); + assert(0 <= pointers_idx && pointers_idx < _vpointers_length, "valid range"); + return _xpointers[pointers_idx]; +} + #ifndef PRODUCT void VLoopVPointers::print() const { tty->print_cr("\nVLoopVPointers::print:"); @@ -225,6 +238,10 @@ void VLoopVPointers::print() const { const VPointer& p = vpointer(mem); tty->print(" "); p.print(); + + const XPointer& xp = xpointer(mem); + tty->print(" "); + xp.print_on(tty); }); } #endif diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 13a45eeb71c95..ca0bf42d744ba 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -35,6 +35,7 @@ // utilities for C2's loop auto-vectorization. class VPointer; +class XPointer; class VStatus : public StackObj { private: @@ -475,6 +476,7 @@ class VLoopVPointers : public StackObj { // Array of cached pointers VPointer* _vpointers; + XPointer* _xpointers; int _vpointers_length; // Map bb_idx -> index in _vpointers. -1 if not mapped. 
@@ -488,6 +490,7 @@ class VLoopVPointers : public StackObj { _vloop(vloop), _body(body), _vpointers(nullptr), + _xpointers(nullptr), _bb_idx_to_vpointer(arena, vloop.estimated_body_length(), vloop.estimated_body_length(), @@ -496,6 +499,7 @@ class VLoopVPointers : public StackObj { void compute_vpointers(); const VPointer& vpointer(const MemNode* mem) const; + const XPointer& xpointer(const MemNode* mem) const; NOT_PRODUCT( void print() const; ) private: From e6d76834bcd56e09a58fa50092f0ffb2bba99d96 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 7 Nov 2024 15:33:44 +0100 Subject: [PATCH 010/130] first steps in find adjacent - but we need to do dependency graph first now --- src/hotspot/share/opto/superword.cpp | 17 +++++++++-------- src/hotspot/share/opto/superword.hpp | 14 +++++++++++++- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index f804194c6df41..93a3420307b90 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -505,9 +505,9 @@ bool SuperWord::SLP_extract() { // Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization. void SuperWord::create_adjacent_memop_pairs() { ResourceMark rm; - GrowableArray vpointers; + GrowableArray memops; - collect_valid_vpointers(vpointers); + collect_valid_memops(memops); // Sort the VPointers. This does 2 things: // - Separate the VPointer into groups: all memops that have the same opcode and the same @@ -516,7 +516,8 @@ void SuperWord::create_adjacent_memop_pairs() { // if they are in the same group. This decreases the work. // - Sort by offset inside the groups. This decreases the work needed to determine adjacent // memops inside a group. 
- vpointers.sort(VPointer::cmp_for_sort); + assert(false, "TODO"); + //vpointers.sort(VPointer::cmp_for_sort); #ifndef PRODUCT if (is_trace_superword_adjacent_memops()) { @@ -524,7 +525,7 @@ void SuperWord::create_adjacent_memop_pairs() { } #endif - create_adjacent_memop_pairs_in_all_groups(vpointers); + //create_adjacent_memop_pairs_in_all_groups(vpointers); #ifndef PRODUCT if (is_trace_superword_packset()) { @@ -535,13 +536,13 @@ void SuperWord::create_adjacent_memop_pairs() { } // Collect all memops vpointers that could potentially be vectorized. -void SuperWord::collect_valid_vpointers(GrowableArray& vpointers) { +void SuperWord::collect_valid_memops(GrowableArray& memops) { for_each_mem([&] (const MemNode* mem, int bb_idx) { - const VPointer& p = vpointer(mem); - if (p.valid() && + const XPointer& p = xpointer(mem); + if (p.is_valid() && !mem->is_LoadStore() && is_java_primitive(mem->memory_type())) { - vpointers.append(&p); + memops.append(MemOp(mem, &p)); } }); } diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 8b24e0cf3a11a..41a897a15ea67 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -503,6 +503,9 @@ class SuperWord : public ResourceObj { const VPointer& vpointer(const MemNode* mem) const { return _vloop_analyzer.vpointers().vpointer(mem); } + const XPointer& xpointer(const MemNode* mem) const { + return _vloop_analyzer.vpointers().xpointer(mem); + } #ifndef PRODUCT // TraceAutoVectorization and TraceSuperWord @@ -563,8 +566,17 @@ class SuperWord : public ResourceObj { bool SLP_extract(); // Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization. 
+ class MemOp : public StackObj { + private: + const MemNode* _mem; + const XPointer* _xpointer; + public: + // Empty, for GrowableArray + MemOp() : _mem(nullptr), _xpointer(nullptr) {} + MemOp(const MemNode* mem, const XPointer* xpointer) : _mem(mem), _xpointer(xpointer) {} + }; void create_adjacent_memop_pairs(); - void collect_valid_vpointers(GrowableArray& vpointers); + void collect_valid_memops(GrowableArray& memops); void create_adjacent_memop_pairs_in_all_groups(const GrowableArray& vpointers); static int find_group_end(const GrowableArray& vpointers, int group_start); void create_adjacent_memop_pairs_in_one_group(const GrowableArray& vpointers, const int group_start, int group_end); From 1b61023f6d1a1fe439422cfa7e26bd7e4ac41336 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 7 Nov 2024 16:08:35 +0100 Subject: [PATCH 011/130] stub for never_overlaps_with, now need trace --- src/hotspot/share/opto/vectorization.cpp | 14 +++++++++++--- src/hotspot/share/opto/vectorization.hpp | 6 ++++++ 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index b1083de214d76..8a0ff9a0954b2 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -273,7 +273,7 @@ void VLoopDependencyGraph::construct() { MemNode* n1 = slice_nodes.at(j); memory_pred_edges.clear(); - const VPointer& p1 = _vpointers.vpointer(n1); + const XPointer& p1 = _vpointers.xpointer(n1); // For all memory nodes before it, check if we need to add a memory edge. 
for (int k = slice_nodes.length() - 1; k > j; k--) { MemNode* n2 = slice_nodes.at(k); @@ -281,8 +281,8 @@ void VLoopDependencyGraph::construct() { // Ignore Load-Load dependencies: if (n1->is_Load() && n2->is_Load()) { continue; } - const VPointer& p2 = _vpointers.vpointer(n2); - if (!VPointer::not_equal(p1.cmp(p2))) { + const XPointer& p2 = _vpointers.xpointer(n2); + if (!p1.never_overlaps_with(p2)) { // Possibly overlapping memory memory_pred_edges.append(_body.bb_idx(n2)); } @@ -485,6 +485,14 @@ int VPointer::cmp_for_sort(const VPointer** p1, const VPointer** p2) { return 0; // equal } +bool XPointer::never_overlaps_with(const XPointer& other) const { + const MemPointerDecomposedForm& s1 = decomposed_form(); + const MemPointerDecomposedForm& s2 = other.decomposed_form(); + //const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA _trace )); + // TODO + return false; +} + #ifndef PRODUCT void XPointer::print_on(outputStream* st) const { st->print("XPointer["); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index ca0bf42d744ba..26d47cb00e6ef 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -681,6 +681,8 @@ class XPointer : public ArenaObj { const jint _size; const bool _is_valid; + //NOT_PRODUCT( const TraceMemPointer& _trace; ) + public: // Default constructor, e.g. for GrowableArray. 
XPointer() : @@ -704,6 +706,10 @@ class XPointer : public ArenaObj { // Accessors bool is_valid() const { return _is_valid; } + const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } + + // Aliasing + bool never_overlaps_with(const XPointer& other) const; NOT_PRODUCT( void print_on(outputStream* st) const; ) From c79f7fc8758e95909bddee5974a1074d1e3fb81e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 7 Nov 2024 16:57:14 +0100 Subject: [PATCH 012/130] tracing wired in --- src/hotspot/share/opto/memnode.cpp | 3 +- src/hotspot/share/opto/mempointer.hpp | 9 ++++-- .../share/opto/traceAutoVectorizationTag.hpp | 5 ++- src/hotspot/share/opto/vectorization.cpp | 21 ++++++++++--- src/hotspot/share/opto/vectorization.hpp | 31 ++++++++++++------- 5 files changed, 50 insertions(+), 19 deletions(-) diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index 919d23fea8da5..a46d8ed0ac25c 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -2933,7 +2933,8 @@ bool MergePrimitiveStores::is_adjacent_pair(const StoreNode* use_store, const St #ifndef PRODUCT const TraceMemPointer trace(is_trace_pointer(), is_trace_aliasing(), - is_trace_adjacency()); + is_trace_adjacency(), + true); #endif const MemPointer pointer_use(use_store NOT_PRODUCT( COMMA trace )); const MemPointer pointer_def(def_store NOT_PRODUCT( COMMA trace )); diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index ea569e542f359..932e65431d031 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -345,22 +345,27 @@ #ifndef PRODUCT class TraceMemPointer : public StackObj { private: + // TODO rename and possibly extend, also rename tags const bool _is_trace_pointer; const bool _is_trace_aliasing; const bool _is_trace_adjacency; + const bool _is_trace_overlap; public: TraceMemPointer(const bool is_trace_pointer, const bool 
is_trace_aliasing, - const bool is_trace_adjacency) : + const bool is_trace_adjacency, + const bool is_trace_overlap) : _is_trace_pointer( is_trace_pointer), _is_trace_aliasing( is_trace_aliasing), - _is_trace_adjacency(is_trace_adjacency) + _is_trace_adjacency(is_trace_adjacency), + _is_trace_overlap(is_trace_overlap) {} bool is_trace_pointer() const { return _is_trace_pointer; } bool is_trace_aliasing() const { return _is_trace_aliasing; } bool is_trace_adjacency() const { return _is_trace_adjacency; } + bool is_trace_overlap() const { return _is_trace_overlap; } }; #endif diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index 038e04fe0c50b..2c1fe6efb0de1 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -29,7 +29,10 @@ #include "utilities/stringUtils.hpp" #define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \ - flags(POINTER_ANALYSIS, "Trace VPointer (verbose)") \ + flags(POINTER, "Trace VPointer construction / parsing") \ + flags(ALIASING, "Trace VPointer aliasing") \ + flags(ADJACENCY, "Trace VPointer adjacency") \ + flags(OVERLAP, "Trace VPointer overlap") \ flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \ flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \ flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 8a0ff9a0954b2..ff042713b752f 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -282,7 +282,7 @@ void VLoopDependencyGraph::construct() { if (n1->is_Load() && n2->is_Load()) { continue; } const XPointer& p2 = _vpointers.xpointer(n2); - if (!p1.never_overlaps_with(p2)) { + if (!p1.never_overlaps_with(p2, _vloop)) { // Possibly overlapping memory memory_pred_edges.append(_body.bb_idx(n2)); } @@ -485,12 +485,25 @@ int 
VPointer::cmp_for_sort(const VPointer** p1, const VPointer** p2) { return 0; // equal } -bool XPointer::never_overlaps_with(const XPointer& other) const { +bool XPointer::never_overlaps_with(const XPointer& other, const VLoop& vloop) const { const MemPointerDecomposedForm& s1 = decomposed_form(); const MemPointerDecomposedForm& s2 = other.decomposed_form(); - //const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA _trace )); + const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA vloop.mptrace() )); + // TODO - return false; + bool is_never_overlap = false; + +#ifndef PRODUCT + if (vloop.mptrace().is_trace_overlap()) { + tty->print("Never Overlap: %s, aliasing: ", is_never_overlap ? "true" : "false"); + //tty->print("Never Overlap: %s, because size = %d and aliasing = ", + // is_adjacent ? "true" : "false", size); + aliasing.print_on(tty); + tty->cr(); + } +#endif + + return is_never_overlap; } #ifndef PRODUCT diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 26d47cb00e6ef..7021588cc9954 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -87,6 +87,7 @@ class VLoop : public StackObj { CountedLoopEndNode* _pre_loop_end; // cache access to pre-loop for main loops only NOT_PRODUCT(VTrace _vtrace;) + NOT_PRODUCT(TraceMemPointer _mptrace; ) static constexpr char const* FAILURE_ALREADY_VECTORIZED = "loop already vectorized"; static constexpr char const* FAILURE_UNROLL_ONLY = "loop only wants to be unrolled"; @@ -104,7 +105,18 @@ class VLoop : public StackObj { _cl (nullptr), _cl_exit (nullptr), _iv (nullptr), - _pre_loop_end (nullptr) {} + _pre_loop_end (nullptr) +#ifndef PRODUCT + COMMA + _mptrace(TraceMemPointer( + _vtrace.is_trace(TraceAutoVectorizationTag::POINTER), + _vtrace.is_trace(TraceAutoVectorizationTag::ALIASING), + _vtrace.is_trace(TraceAutoVectorizationTag::ADJACENCY), + 
_vtrace.is_trace(TraceAutoVectorizationTag::OVERLAP) + )) +#endif + {} + NONCOPYABLE(VLoop); IdealLoopTree* lpt() const { return _lpt; }; @@ -135,7 +147,8 @@ class VLoop : public StackObj { static bool vectors_should_be_aligned() { return !Matcher::misaligned_vectors_ok() || AlignVector; } #ifndef PRODUCT - const VTrace& vtrace() const { return _vtrace; } + const VTrace& vtrace() const { return _vtrace; } + const TraceMemPointer& mptrace() const { return _mptrace; } bool is_trace_preconditions() const { return _vtrace.is_trace(TraceAutoVectorizationTag::PRECONDITIONS); @@ -164,10 +177,6 @@ class VLoop : public StackObj { bool is_trace_vpointers() const { return _vtrace.is_trace(TraceAutoVectorizationTag::POINTERS); } - - bool is_trace_pointer_analysis() const { - return _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_ANALYSIS); - } #endif // Is the node in the basic block of the loop? @@ -681,8 +690,6 @@ class XPointer : public ArenaObj { const jint _size; const bool _is_valid; - //NOT_PRODUCT( const TraceMemPointer& _trace; ) - public: // Default constructor, e.g. for GrowableArray. 
XPointer() : @@ -697,9 +704,11 @@ class XPointer : public ArenaObj { _is_valid(init_is_valid(_decomposed_form, vloop)) { #ifndef PRODUCT - if (vloop.is_trace_pointer_analysis()) { + if (vloop.mptrace().is_trace_pointer()) { + tty->print_cr("XPointer::XPointer:"); + tty->print("mem: "); mem->dump(); print_on(tty); - mem->dump(); + mem->in(MemNode::Address)->dump_bfs(7, 0, "d"); } #endif } @@ -709,7 +718,7 @@ class XPointer : public ArenaObj { const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } // Aliasing - bool never_overlaps_with(const XPointer& other) const; + bool never_overlaps_with(const XPointer& other, const VLoop& vloop) const; NOT_PRODUCT( void print_on(outputStream* st) const; ) From f336d84d1ceb666c613251af2269d25c0c75cb65 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 8 Nov 2024 08:21:33 +0100 Subject: [PATCH 013/130] is_never_in_distance_range --- src/hotspot/share/opto/mempointer.hpp | 7 +++++++ src/hotspot/share/opto/vectorization.cpp | 19 ++++++++++++++----- src/hotspot/share/opto/vectorization.hpp | 1 + 3 files changed, 22 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 932e65431d031..2f946fb073707 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -405,6 +405,13 @@ class MemPointerAliasing { return _aliasing == Always && _distance == distance; } + // Use case: overlap. 
+ // Note: the bounds are exclusive: lo < element < hi + bool is_never_in_distance_range(const jint distance_lo, const jint distance_hi) const { + return _aliasing == Always && + (_distance <= distance_lo || distance_hi <= _distance); + } + #ifndef PRODUCT void print_on(outputStream* st) const { switch(_aliasing) { diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index ff042713b752f..962a60c960ed3 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -490,14 +490,23 @@ bool XPointer::never_overlaps_with(const XPointer& other, const VLoop& vloop) co const MemPointerDecomposedForm& s2 = other.decomposed_form(); const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA vloop.mptrace() )); - // TODO - bool is_never_overlap = false; + // The aliasing tries to compute: + // distance = s2 - s1 + // + // We know that we have no overlap if we can prove: + // s1 >= s2 + s2_size || s1 + s1_size <= s2 + // + // Which we can restate as: + // distance <= -s2_size || s1_size <= distance + // + const jint distance_lo = -other.size(); + const jint distance_hi = size(); + bool is_never_overlap = aliasing.is_never_in_distance_range(distance_lo, distance_hi); #ifndef PRODUCT if (vloop.mptrace().is_trace_overlap()) { - tty->print("Never Overlap: %s, aliasing: ", is_never_overlap ? "true" : "false"); - //tty->print("Never Overlap: %s, because size = %d and aliasing = ", - // is_adjacent ? "true" : "false", size); + tty->print("Never Overlap: %s, distance_lo: %d, distance_hi: %d, aliasing: ", + is_never_overlap ? 
"true" : "false", distance_lo, distance_hi); aliasing.print_on(tty); tty->cr(); } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 7021588cc9954..43a9ce7894490 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -716,6 +716,7 @@ class XPointer : public ArenaObj { // Accessors bool is_valid() const { return _is_valid; } const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } + jint size() const { return _size; } // Aliasing bool never_overlaps_with(const XPointer& other, const VLoop& vloop) const; From 92d1b12a6fc99998c24e494fe5ab0b89b8a4abd0 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 8 Nov 2024 08:56:00 +0100 Subject: [PATCH 014/130] rename and introduce definition of NotOrAtDistance --- src/hotspot/share/opto/mempointer.cpp | 2 +- src/hotspot/share/opto/mempointer.hpp | 35 +++++++++++++++++---------- 2 files changed, 23 insertions(+), 14 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 0252b60543ef3..27b7e6c4cbce9 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -370,7 +370,7 @@ MemPointerAliasing MemPointerDecomposedForm::get_aliasing_with(const MemPointerD tty->print_cr(" -> Aliasing always, distance = %d.", distance.value()); } #endif - return MemPointerAliasing::make_always(distance.value()); + return MemPointerAliasing::make_always_at_distance(distance.value()); } bool MemPointer::is_adjacent_to_and_before(const MemPointer& other) const { diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 2f946fb073707..64ed0287aba5f 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -373,13 +373,21 @@ class TraceMemPointer : public StackObj { class MemPointerAliasing { public: enum Aliasing { - Unknown, // Distance unknown. 
- // Example: two "int[]" with different variable index offsets. - // e.g. "array[i] vs array[j]". - // e.g. "array1[i] vs array2[j]". - Always}; // Constant distance = p1 - p2. - // Example: The same address expression, except for a constant offset - // e.g. "array[i] vs array[i+1]". + Unknown, // Distance unknown. + // Example: two "int[]" (unknown if the same) with different variable index offsets: + // e.g. "array[i] vs array[j]". + // e.g. "array1[i] vs array2[j]". + AlwaysAtDistance, // Constant distance = p2 - p1. + // Example: The same address expression, except for a constant offset: + // e.g. "array[i] vs array[i+1]". + NotOrAtDistance}; // At compile-time, we know that at run-time it is either of these: + // (1) Not: The pointers belong to different memory objects. + // (2) AtConstDistance: distance = p2 - p1. + // Example: two "int[]" (unknown if the same) with indices that only differ by a + // constant offset: + // e.g. "array1[i] vs array2[i+4]": + // if "array1 == array2": distance = 4. + // if "array1 != array2": different memory objects. private: const Aliasing _aliasing; const jint _distance; @@ -396,27 +404,28 @@ class MemPointerAliasing { return MemPointerAliasing(Unknown, 0); } - static MemPointerAliasing make_always(const jint distance) { - return MemPointerAliasing(Always, distance); + static MemPointerAliasing make_always_at_distance(const jint distance) { + return MemPointerAliasing(AlwaysAtDistance, distance); } // Use case: exact aliasing and adjacency. bool is_always_at_distance(const jint distance) const { - return _aliasing == Always && _distance == distance; + return _aliasing == AlwaysAtDistance && _distance == distance; } // Use case: overlap. 
// Note: the bounds are exclusive: lo < element < hi bool is_never_in_distance_range(const jint distance_lo, const jint distance_hi) const { - return _aliasing == Always && + return _aliasing == AlwaysAtDistance && (_distance <= distance_lo || distance_hi <= _distance); } #ifndef PRODUCT void print_on(outputStream* st) const { switch(_aliasing) { - case Unknown: st->print("Unknown"); break; - case Always: st->print("Always(%d)", _distance); break; + case Unknown: st->print("Unknown"); break; + case AlwaysAtDistance: st->print("AlwaysAtDistance(%d)", _distance); break; + case NotOrAtDistance: st->print("NotOrAtDistance(%d)", _distance); break; default: ShouldNotReachHere(); } } From 50215fdc2bfcc2726576424577111bf11328d02c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 8 Nov 2024 10:31:32 +0100 Subject: [PATCH 015/130] implement make_not_or_at_distance and has_different_base_but_otherwise_same_summands_as --- src/hotspot/share/opto/mempointer.cpp | 123 +++++++++++++++++++++----- src/hotspot/share/opto/mempointer.hpp | 17 +++- 2 files changed, 114 insertions(+), 26 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 27b7e6c4cbce9..8dec431b64234 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -328,17 +328,22 @@ MemPointerAliasing MemPointerDecomposedForm::get_aliasing_with(const MemPointerD #endif // "MemPointer Lemma" condition (S2): check if all summands are the same: - for (uint i = 0; i < SUMMANDS_SIZE; i++) { - const MemPointerSummand s1 = summands_at(i); - const MemPointerSummand s2 = other.summands_at(i); - if (s1 != s2) { + bool has_same_base = false; + if (has_different_base_but_otherwise_same_summands_as(other)) { + // At runtime, the two object bases can be: + // (1) different: we have no aliasing, pointers point to different memory objects. + // (2) the same: implies that all summands are the same, (S2) holds. 
+ has_same_base = false; + } else if (has_same_summands_as(other)) { + // (S2) holds. If all summands are the same, also the base must be the same. + has_same_base = true; + } else { #ifndef PRODUCT - if (trace.is_trace_aliasing()) { - tty->print_cr(" -> Aliasing unknown, differ on summand %d.", i); - } -#endif - return MemPointerAliasing::make_unknown(); + if (trace.is_trace_aliasing()) { + tty->print_cr(" -> Aliasing unknown, summands are not the same."); } +#endif + return MemPointerAliasing::make_unknown(); } // "MemPointer Lemma" condition (S3): check that the constants do not differ too much: @@ -356,21 +361,95 @@ MemPointerAliasing MemPointerDecomposedForm::get_aliasing_with(const MemPointerD return MemPointerAliasing::make_unknown(); } - // "MemPointer Lemma" condition (S1): - // Given that all summands are the same, we know that both pointers point into the - // same memory object. With the Pre-Condition, we know that both pointers are in - // bounds of that same memory object. - - // Hence, all 4 conditions of the "MemoryPointer Lemma" are established, and hence - // we know that the distance between the underlying pointers is equal to the distance - // we computed for the MemPointers: - // p_other - p_this = distance = other.con - this.con + if (has_same_base) { + // "MemPointer Lemma" condition (S1): + // Given that all summands are the same, we know that both pointers point into the + // same memory object. With the Pre-Condition, we know that both pointers are in + // bounds of that same memory object. 
+ // + // Hence, all 4 conditions of the "MemPointer Lemma" are established, and hence + // we know that the distance between the underlying pointers is equal to the distance + // we computed for the MemPointers: + // p_other - p_this = distance = other.con - this.con #ifndef PRODUCT - if (trace.is_trace_aliasing()) { - tty->print_cr(" -> Aliasing always, distance = %d.", distance.value()); - } + if (trace.is_trace_aliasing()) { + tty->print_cr(" -> Aliasing always at distance = %d.", distance.value()); + } +#endif + return MemPointerAliasing::make_always_at_distance(distance.value()); + } else { + // At runtime, the two object bases can be: + // (1) different: pointers do not alias. + // (2) the same: implies that (S2) holds. The summands are all the same, and with + // the Pre-Condition, we know that both pointers are in bounds of the + // same memory object, i.e. (S1) holds. We have already proven (S0) + // and (S3), so all 4 conditions for "MemPointer Lemma" are given. +#ifndef PRODUCT + if (trace.is_trace_aliasing()) { + tty->print_cr(" -> Aliasing not or at distance = %d.", distance.value()); + } #endif - return MemPointerAliasing::make_always_at_distance(distance.value()); + return MemPointerAliasing::make_not_or_at_distance(distance.value()); + } +} + +bool MemPointerDecomposedForm::has_same_summands_as(const MemPointerDecomposedForm& other) const { + for (uint i = 0; i < SUMMANDS_SIZE; i++) { + if (summands_at(i) != other.summands_at(i)) { return false; } + } + return true; +} + +bool MemPointerDecomposedForm::has_different_base_but_otherwise_same_summands_as(const MemPointerDecomposedForm& other) const { + if (!base().is_object() || + !other.base().is_object() || + base().get() == other.base().get()) { + // The base is the same, or we do not know if the base is different. 
+ return false; + } + const MemPointerSummand base1(base().get(), NoOverflowInt(1)); + const MemPointerSummand base2(other.base().get(), NoOverflowInt(1)); + bool found_base1 = false; + bool found_base2 = false; + + uint i1 = 0; + uint i2 = 0; + while (i1 < SUMMANDS_SIZE || i2 < SUMMANDS_SIZE) { + // Handle bases. + if (i1 < SUMMANDS_SIZE && summands_at(i1) == base1) { + assert(!found_base1, "can only find once"); + found_base1 = true; + i1++; + continue; + } + if (i2 < SUMMANDS_SIZE && other.summands_at(i2) == base2) { + assert(!found_base2, "can only find once"); + found_base2 = true; + i2++; + continue; + } + // Handle empty summands. + if (i1 < SUMMANDS_SIZE && summands_at(i1).variable() == nullptr) { + i1++; + continue; + } + if (i2 < SUMMANDS_SIZE && other.summands_at(i2).variable() == nullptr) { + i2++; + continue; + } + // Handle other summands. + if (i1 < SUMMANDS_SIZE && i2 < SUMMANDS_SIZE && + summands_at(i1) == other.summands_at(i2)) { + i1++; + i2++; + continue; + } + // There is a difference in the summands, other than the bases. + return false; + } + assert(i1 == SUMMANDS_SIZE && i2 == SUMMANDS_SIZE, "scanned all"); + // Check if we found both bases - the other summands are all the same. + return found_base1 && found_base2; } bool MemPointer::is_adjacent_to_and_before(const MemPointer& other) const { diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 64ed0287aba5f..54ddc9e708e03 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -371,7 +371,7 @@ class TraceMemPointer : public StackObj { // Class to represent aliasing between two MemPointer. class MemPointerAliasing { -public: +private: enum Aliasing { Unknown, // Distance unknown. // Example: two "int[]" (unknown if the same) with different variable index offsets: @@ -381,14 +381,13 @@ class MemPointerAliasing { // Example: The same address expression, except for a constant offset: // e.g. 
"array[i] vs array[i+1]". NotOrAtDistance}; // At compile-time, we know that at run-time it is either of these: - // (1) Not: The pointers belong to different memory objects. + // (1) Not: The pointers belong to different memory objects. Distance unknown. // (2) AtConstDistance: distance = p2 - p1. // Example: two "int[]" (unknown if the same) with indices that only differ by a // constant offset: // e.g. "array1[i] vs array2[i+4]": // if "array1 == array2": distance = 4. // if "array1 != array2": different memory objects. -private: const Aliasing _aliasing; const jint _distance; @@ -408,6 +407,10 @@ class MemPointerAliasing { return MemPointerAliasing(AlwaysAtDistance, distance); } + static MemPointerAliasing make_not_or_at_distance(const jint distance) { + return MemPointerAliasing(NotOrAtDistance, distance); + } + // Use case: exact aliasing and adjacency. bool is_always_at_distance(const jint distance) const { return _aliasing == AlwaysAtDistance && _distance == distance; @@ -416,7 +419,7 @@ class MemPointerAliasing { // Use case: overlap. 
// Note: the bounds are exclusive: lo < element < hi bool is_never_in_distance_range(const jint distance_lo, const jint distance_hi) const { - return _aliasing == AlwaysAtDistance && + return (_aliasing == AlwaysAtDistance || _aliasing == NotOrAtDistance) && (_distance <= distance_lo || distance_hi <= _distance); } @@ -513,6 +516,7 @@ class MemPointerDecomposedForm : public StackObj { static Base from_AddP(Node* pointer); bool is_known() const { return _is_known; } Node* get() const { assert(is_known(), "must be"); return _base; } + bool is_object() const { return _is_known && _base != nullptr; } #ifndef PRODUCT void print_on(outputStream* st) const { @@ -578,6 +582,11 @@ class MemPointerDecomposedForm : public StackObj { MemPointerAliasing get_aliasing_with(const MemPointerDecomposedForm& other NOT_PRODUCT( COMMA const TraceMemPointer& trace) ) const; +private: + bool has_same_summands_as(const MemPointerDecomposedForm& other) const; + bool has_different_base_but_otherwise_same_summands_as(const MemPointerDecomposedForm& other) const; + +public: const MemPointerSummand& summands_at(const uint i) const { assert(i < SUMMANDS_SIZE, "in bounds"); return _summands[i]; From 0c4a0ec62d6c4b8da384cd8fea13f0f66c71b6c9 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 8 Nov 2024 11:16:02 +0100 Subject: [PATCH 016/130] simplify base check, by moving it to the 0th summand --- src/hotspot/share/opto/mempointer.cpp | 52 +++++---------------------- src/hotspot/share/opto/mempointer.hpp | 33 +++++++++++++++-- 2 files changed, 39 insertions(+), 46 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 8dec431b64234..9430fe5b34cd5 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -393,8 +393,8 @@ MemPointerAliasing MemPointerDecomposedForm::get_aliasing_with(const MemPointerD } } -bool MemPointerDecomposedForm::has_same_summands_as(const MemPointerDecomposedForm& other) const 
{ - for (uint i = 0; i < SUMMANDS_SIZE; i++) { +bool MemPointerDecomposedForm::has_same_summands_as(const MemPointerDecomposedForm& other, uint start) const { + for (uint i = start; i < SUMMANDS_SIZE; i++) { if (summands_at(i) != other.summands_at(i)) { return false; } } return true; @@ -407,49 +407,15 @@ bool MemPointerDecomposedForm::has_different_base_but_otherwise_same_summands_as // The base is the same, or we do not know if the base is different. return false; } + +#ifdef ASSERT const MemPointerSummand base1(base().get(), NoOverflowInt(1)); const MemPointerSummand base2(other.base().get(), NoOverflowInt(1)); - bool found_base1 = false; - bool found_base2 = false; - - uint i1 = 0; - uint i2 = 0; - while (i1 < SUMMANDS_SIZE || i2 < SUMMANDS_SIZE) { - // Handle bases. - if (i1 < SUMMANDS_SIZE && summands_at(i1) == base1) { - assert(!found_base1, "can only find once"); - found_base1 = true; - i1++; - continue; - } - if (i2 < SUMMANDS_SIZE && other.summands_at(i2) == base2) { - assert(!found_base2, "can only find once"); - found_base2 = true; - i2++; - continue; - } - // Handle empty summands. - if (i1 < SUMMANDS_SIZE && summands_at(i1).variable() == nullptr) { - i1++; - continue; - } - if (i2 < SUMMANDS_SIZE && other.summands_at(i2).variable() == nullptr) { - i2++; - continue; - } - // Handle other summands. - if (i1 < SUMMANDS_SIZE && i2 < SUMMANDS_SIZE && - summands_at(i1) == other.summands_at(i2)) { - i1++; - i2++; - continue; - } - // There is a difference in the summands, other than the bases. - return false; - } - assert(i1 == SUMMANDS_SIZE && i2 == SUMMANDS_SIZE, "scanned all"); - // Check if we found both bases - the other summands are all the same. - return found_base1 && found_base2; + assert(summands_at(0) == base1 && other.summands_at(0) == base2, "bases in 0th element"); +#endif + + // Check if all other summands are the same. 
+ return has_same_summands_as(other, 1); } bool MemPointer::is_adjacent_to_and_before(const MemPointer& other) const { diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 54ddc9e708e03..da6c02f113ab8 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -495,6 +495,8 @@ class MemPointerSummand : public StackObj { // // pointer = SUM(summands) + con // +// Note: if the base is known to be an object reference (base().is_object()), then +// the base is in the 0th summand. class MemPointerDecomposedForm : public StackObj { public: // We limit the number of summands to 10. This is just a best guess, and not at this @@ -546,7 +548,7 @@ class MemPointerDecomposedForm : public StackObj { // Default / trivial: pointer = 0 + 1 * pointer MemPointerDecomposedForm(Node* pointer) : _con(NoOverflowInt(0)), - _base(Base::from_AddP(pointer)) + _base(Base()) { assert(pointer != nullptr, "pointer must be non-null"); _summands[0] = MemPointerSummand(pointer, NoOverflowInt(1)); @@ -558,10 +560,34 @@ class MemPointerDecomposedForm : public StackObj { { assert(!_con.is_NaN(), "non-NaN constant"); assert(summands.length() <= SUMMANDS_SIZE, "summands must fit"); +#ifdef ASSERT for (int i = 0; i < summands.length(); i++) { - MemPointerSummand s = summands.at(i); + const MemPointerSummand& s = summands.at(i); assert(s.variable() != nullptr, "variable cannot be null"); assert(!s.scale().is_NaN(), "non-NaN scale"); + } +#endif + + if (_base.is_object()) { + MemPointerSummand b(_base.get(), NoOverflowInt(1)); + if (summands.contains(b)) { + // We have a known base object, move it to the 0th summand. + _summands[0] = b; + int pos = 1; + for (int i = 0; i < summands.length(); i++) { + if (summands.at(i) == b) { continue; } + _summands[pos++] = summands.at(i); + } + return; + } else { + // We did not find the base object, reset to unknown base. 
+ assert(false, "we should always find the base"); + _base = Base(); + } + } + + for (int i = 0; i < summands.length(); i++) { + const MemPointerSummand& s = summands.at(i); _summands[i] = s; } } @@ -583,7 +609,8 @@ class MemPointerDecomposedForm : public StackObj { NOT_PRODUCT( COMMA const TraceMemPointer& trace) ) const; private: - bool has_same_summands_as(const MemPointerDecomposedForm& other) const; + bool has_same_summands_as(const MemPointerDecomposedForm& other, uint start) const; + bool has_same_summands_as(const MemPointerDecomposedForm& other) const { return has_same_summands_as(other, 0); }; bool has_different_base_but_otherwise_same_summands_as(const MemPointerDecomposedForm& other) const; public: From 864fba018ff14b4bd4126167956617f58a42aa95 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 8 Nov 2024 12:46:20 +0100 Subject: [PATCH 017/130] refactor of create adjacent pairs, though body is missing --- src/hotspot/share/opto/mempointer.hpp | 29 +++- src/hotspot/share/opto/noOverflowInt.hpp | 11 ++ src/hotspot/share/opto/superword.cpp | 167 +++++++++++++---------- src/hotspot/share/opto/superword.hpp | 15 +- src/hotspot/share/opto/vectorization.hpp | 2 +- 5 files changed, 146 insertions(+), 78 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index da6c02f113ab8..319c5112b04c0 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -461,13 +461,24 @@ class MemPointerSummand : public StackObj { NoOverflowInt scale() const { return _scale; } static int cmp_by_variable_idx(MemPointerSummand* p1, MemPointerSummand* p2) { - if (p1->variable() == nullptr) { - return (p2->variable() == nullptr) ? 0 : 1; - } else if (p2->variable() == nullptr) { + return cmp_by_variable_idx(*p1, *p2); + } + + static int cmp_by_variable_idx(const MemPointerSummand& p1, const MemPointerSummand& p2) { + if (p1.variable() == nullptr) { + return (p2.variable() == nullptr) ? 
0 : 1; + } else if (p2.variable() == nullptr) { return -1; } - return p1->variable()->_idx - p2->variable()->_idx; + return p1.variable()->_idx - p2.variable()->_idx; + } + + static int cmp(const MemPointerSummand& p1, const MemPointerSummand& p2) { + int cmp = cmp_by_variable_idx(p1, p2); + if (cmp != 0) { return cmp; } + + return NoOverflowInt::cmp(p1.scale(), p2.scale()); } friend bool operator==(const MemPointerSummand a, const MemPointerSummand b) { @@ -622,6 +633,16 @@ class MemPointerDecomposedForm : public StackObj { const NoOverflowInt con() const { return _con; } const Base& base() const { return _base; } + static int cmp_summands(const MemPointerDecomposedForm& a, const MemPointerDecomposedForm& b) { + for (int i = 0; i < SUMMANDS_SIZE; i++) { + const MemPointerSummand& s_a = a.summands_at(i); + const MemPointerSummand& s_b = b.summands_at(i); + int cmp = MemPointerSummand::cmp(s_a, s_b); + if (cmp != 0) { return cmp;} + } + return 0; + } + #ifndef PRODUCT void print_form_on(outputStream* st) const { if (_con.is_NaN()) { diff --git a/src/hotspot/share/opto/noOverflowInt.hpp b/src/hotspot/share/opto/noOverflowInt.hpp index 9da24645b4117..302375fdbde5a 100644 --- a/src/hotspot/share/opto/noOverflowInt.hpp +++ b/src/hotspot/share/opto/noOverflowInt.hpp @@ -100,6 +100,17 @@ class NoOverflowInt { return a.value() % b.value() == 0; } + static int cmp(const NoOverflowInt& a, const NoOverflowInt& b) { + if (a.is_NaN()) { + return b.is_NaN() ? 
0 : 1; + } else if (b.is_NaN()) { + return -1; + } + if (a.value() < b.value()) { return -1; } + if (a.value() > b.value()) { return 1; } + return 0; + } + #ifndef PRODUCT void print_on(outputStream* st) const { if (is_NaN()) { diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 93a3420307b90..ad9a1ddab5ec9 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -25,6 +25,7 @@ #include "opto/addnode.hpp" #include "opto/castnode.hpp" #include "opto/convertnode.hpp" +#include "opto/memnode.hpp" #include "opto/superword.hpp" #include "opto/superwordVTransformBuilder.hpp" #include "opto/vectornode.hpp" @@ -502,6 +503,33 @@ bool SuperWord::SLP_extract() { return schedule_and_apply(); } +// We use two comparisons, because a subtraction could underflow. +#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \ + if (a < b) { return -1; } \ + if (a > b) { return 1; } + +int SuperWord::MemOp::cmp_by_group(MemOp* a, MemOp* b) { + // Opcode + RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->Opcode(), b->mem()->Opcode()); + + // VPointer summands + return MemPointerDecomposedForm::cmp_summands(a->xpointer().decomposed_form(), + b->xpointer().decomposed_form()); +} + +int SuperWord::MemOp::cmp_by_group_and_con(MemOp* a, MemOp* b) { + // Group + int cmp_group = cmp_by_group(a, b); + if (cmp_group != 0) { return cmp_group; } + + // VPointer con + jint a_con = a->xpointer().decomposed_form().con().value(); + jint b_con = b->xpointer().decomposed_form().con().value(); + RETURN_CMP_VALUE_IF_NOT_EQUAL(a_con, b_con); + + return 0; +} + // Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization. void SuperWord::create_adjacent_memop_pairs() { ResourceMark rm; @@ -509,15 +537,14 @@ void SuperWord::create_adjacent_memop_pairs() { collect_valid_memops(memops); - // Sort the VPointers. 
This does 2 things: - // - Separate the VPointer into groups: all memops that have the same opcode and the same - // VPointer, except for the offset. Adjacent memops must have the same opcode and the - // same VPointer, except for a shift in the offset. Thus, two memops can only be adjacent - // if they are in the same group. This decreases the work. - // - Sort by offset inside the groups. This decreases the work needed to determine adjacent - // memops inside a group. - assert(false, "TODO"); - //vpointers.sort(VPointer::cmp_for_sort); + // Sort the MemOps by group, and inside a group by VPointer con: + // - Group: all memops with the same opcode, and the same VPointer summands. Adjacent memops + // have the same opcode and the same VPointer summands, only the VPointer con is + // different. Thus, two memops can only be adjacent if they are in the same group. + // This decreases the work. + // - VPointer con: Sorting by VPointer con inside the group allows us to perform a sliding + // window algorithm, to determine adjacent memops efficiently. + memops.sort(MemOp::cmp_by_group_and_con); #ifndef PRODUCT if (is_trace_superword_adjacent_memops()) { @@ -525,7 +552,7 @@ void SuperWord::create_adjacent_memop_pairs() { } #endif - //create_adjacent_memop_pairs_in_all_groups(vpointers); + create_adjacent_memop_pairs_in_all_groups(memops); #ifndef PRODUCT if (is_trace_superword_packset()) { @@ -535,7 +562,7 @@ void SuperWord::create_adjacent_memop_pairs() { #endif } -// Collect all memops vpointers that could potentially be vectorized. +// Collect all memops that could potentially be vectorized. void SuperWord::collect_valid_memops(GrowableArray& memops) { for_each_mem([&] (const MemNode* mem, int bb_idx) { const XPointer& p = xpointer(mem); @@ -548,22 +575,22 @@ void SuperWord::collect_valid_memops(GrowableArray& memops) { } // For each group, find the adjacent memops. 
-void SuperWord::create_adjacent_memop_pairs_in_all_groups(const GrowableArray &vpointers) { +void SuperWord::create_adjacent_memop_pairs_in_all_groups(const GrowableArray& memops) { int group_start = 0; - while (group_start < vpointers.length()) { - int group_end = find_group_end(vpointers, group_start); - create_adjacent_memop_pairs_in_one_group(vpointers, group_start, group_end); + while (group_start < memops.length()) { + int group_end = find_group_end(memops, group_start); + create_adjacent_memop_pairs_in_one_group(memops, group_start, group_end); group_start = group_end; } } -// Step forward until we find a VPointer of another group, or we reach the end of the array. -int SuperWord::find_group_end(const GrowableArray& vpointers, int group_start) { +// Step forward until we find a MemOp of another group, or we reach the end of the array. +int SuperWord::find_group_end(const GrowableArray& memops, int group_start) { int group_end = group_start + 1; - while (group_end < vpointers.length() && - VPointer::cmp_for_sort_by_group( - vpointers.adr_at(group_start), - vpointers.adr_at(group_end) + while (group_end < memops.length() && + MemOp::cmp_by_group( + memops.adr_at(group_start), + memops.adr_at(group_end) ) == 0) { group_end++; } @@ -572,64 +599,66 @@ int SuperWord::find_group_end(const GrowableArray& vpointers, i // Find adjacent memops for a single group, e.g. for all LoadI of the same base, invar, etc. // Create pairs and add them to the pairset. 
-void SuperWord::create_adjacent_memop_pairs_in_one_group(const GrowableArray& vpointers, const int group_start, const int group_end) { +void SuperWord::create_adjacent_memop_pairs_in_one_group(const GrowableArray& memops, const int group_start, const int group_end) { #ifndef PRODUCT if (is_trace_superword_adjacent_memops()) { tty->print_cr(" group:"); for (int i = group_start; i < group_end; i++) { - const VPointer* p = vpointers.at(i); + const MemOp& memop = memops.at(i); + tty->print(" "); + memop.mem()->dump(); tty->print(" "); - p->print(); + memop.xpointer().print_on(tty); } } #endif - MemNode* first = vpointers.at(group_start)->mem(); - int element_size = data_size(first); - - // For each ref in group: find others that can be paired: - for (int i = group_start; i < group_end; i++) { - const VPointer* p1 = vpointers.at(i); - MemNode* mem1 = p1->mem(); - - bool found = false; - // For each ref in group with larger or equal offset: - for (int j = i + 1; j < group_end; j++) { - const VPointer* p2 = vpointers.at(j); - MemNode* mem2 = p2->mem(); - assert(mem1 != mem2, "look only at pair of different memops"); + const MemNode* first = memops.at(group_start).mem(); + const int element_size = data_size(first); - // Check for correct distance. 
- assert(data_size(mem1) == element_size, "all nodes in group must have the same element size"); - assert(data_size(mem2) == element_size, "all nodes in group must have the same element size"); - assert(p1->offset_in_bytes() <= p2->offset_in_bytes(), "must be sorted by offset"); - if (p1->offset_in_bytes() + element_size > p2->offset_in_bytes()) { continue; } - if (p1->offset_in_bytes() + element_size < p2->offset_in_bytes()) { break; } - - // Only allow nodes from same origin idx to be packed (see CompileCommand Option Vectorize) - if (_do_vector_loop && !same_origin_idx(mem1, mem2)) { continue; } - - if (!can_pack_into_pair(mem1, mem2)) { continue; } - -#ifndef PRODUCT - if (is_trace_superword_adjacent_memops()) { - if (found) { - tty->print_cr(" WARNING: multiple pairs with the same node. Ignored pairing:"); - } else { - tty->print_cr(" pair:"); - } - tty->print(" "); - p1->print(); - tty->print(" "); - p2->print(); - } -#endif - - if (!found) { - _pairset.add_pair(mem1, mem2); - } - } - } +// // For each ref in group: find others that can be paired: +// for (int i = group_start; i < group_end; i++) { +// const VPointer* p1 = vpointers.at(i); +// MemNode* mem1 = p1->mem(); +// +// bool found = false; +// // For each ref in group with larger or equal offset: +// for (int j = i + 1; j < group_end; j++) { +// const VPointer* p2 = vpointers.at(j); +// MemNode* mem2 = p2->mem(); +// assert(mem1 != mem2, "look only at pair of different memops"); +// +// // Check for correct distance. 
+// assert(data_size(mem1) == element_size, "all nodes in group must have the same element size"); +// assert(data_size(mem2) == element_size, "all nodes in group must have the same element size"); +// assert(p1->offset_in_bytes() <= p2->offset_in_bytes(), "must be sorted by offset"); +// if (p1->offset_in_bytes() + element_size > p2->offset_in_bytes()) { continue; } +// if (p1->offset_in_bytes() + element_size < p2->offset_in_bytes()) { break; } +// +// // Only allow nodes from same origin idx to be packed (see CompileCommand Option Vectorize) +// if (_do_vector_loop && !same_origin_idx(mem1, mem2)) { continue; } +// +// if (!can_pack_into_pair(mem1, mem2)) { continue; } +// +//#ifndef PRODUCT +// if (is_trace_superword_adjacent_memops()) { +// if (found) { +// tty->print_cr(" WARNING: multiple pairs with the same node. Ignored pairing:"); +// } else { +// tty->print_cr(" pair:"); +// } +// tty->print(" "); +// p1->print(); +// tty->print(" "); +// p2->print(); +// } +//#endif +// +// if (!found) { +// _pairset.add_pair(mem1, mem2); +// } +// } +// } } void VLoopMemorySlices::find_memory_slices() { diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 41a897a15ea67..586fc18eba8b9 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -474,7 +474,7 @@ class SuperWord : public ResourceObj { return _vloop_analyzer.types().same_velt_type(n1, n2); } - int data_size(Node* n) const { + int data_size(const Node* n) const { return _vloop_analyzer.types().data_size(n); } @@ -570,16 +570,23 @@ class SuperWord : public ResourceObj { private: const MemNode* _mem; const XPointer* _xpointer; + public: // Empty, for GrowableArray MemOp() : _mem(nullptr), _xpointer(nullptr) {} MemOp(const MemNode* mem, const XPointer* xpointer) : _mem(mem), _xpointer(xpointer) {} + + const MemNode* mem() const { return _mem; } + const XPointer& xpointer() const { return *_xpointer; } + + static int 
cmp_by_group(MemOp* a, MemOp* b); + static int cmp_by_group_and_con(MemOp* a, MemOp* b); }; void create_adjacent_memop_pairs(); void collect_valid_memops(GrowableArray& memops); - void create_adjacent_memop_pairs_in_all_groups(const GrowableArray& vpointers); - static int find_group_end(const GrowableArray& vpointers, int group_start); - void create_adjacent_memop_pairs_in_one_group(const GrowableArray& vpointers, const int group_start, int group_end); + void create_adjacent_memop_pairs_in_all_groups(const GrowableArray& memops); + static int find_group_end(const GrowableArray& memops, int group_start); + void create_adjacent_memop_pairs_in_one_group(const GrowableArray& memops, const int group_start, int group_end); // Various methods to check if we can pack two nodes. bool can_pack_into_pair(Node* s1, Node* s2); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 43a9ce7894490..d20c22bc26a23 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -438,7 +438,7 @@ class VLoopTypes : public StackObj { return velt_type(n)->array_element_basic_type(); } - int data_size(Node* s) const { + int data_size(const Node* s) const { int bsize = type2aelembytes(velt_basic_type(s)); assert(bsize != 0, "valid size"); return bsize; From 0096d851ea5aa5fd60fba32fb0b8db8fe02632eb Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 8 Nov 2024 14:55:16 +0100 Subject: [PATCH 018/130] refactor create_adjacent_memop_pairs_in_one_group --- src/hotspot/share/opto/superword.cpp | 90 ++++++++++++------------ src/hotspot/share/opto/superword.hpp | 6 +- src/hotspot/share/opto/vectorization.hpp | 1 + 3 files changed, 49 insertions(+), 48 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index ad9a1ddab5ec9..0a1329d3f0587 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -564,7 +564,7 @@ void 
SuperWord::create_adjacent_memop_pairs() { // Collect all memops that could potentially be vectorized. void SuperWord::collect_valid_memops(GrowableArray& memops) { - for_each_mem([&] (const MemNode* mem, int bb_idx) { + for_each_mem([&] (MemNode* mem, int bb_idx) { const XPointer& p = xpointer(mem); if (p.is_valid() && !mem->is_LoadStore() && @@ -613,52 +613,52 @@ void SuperWord::create_adjacent_memop_pairs_in_one_group(const GrowableArraymem(); -// -// bool found = false; -// // For each ref in group with larger or equal offset: -// for (int j = i + 1; j < group_end; j++) { -// const VPointer* p2 = vpointers.at(j); -// MemNode* mem2 = p2->mem(); -// assert(mem1 != mem2, "look only at pair of different memops"); -// -// // Check for correct distance. -// assert(data_size(mem1) == element_size, "all nodes in group must have the same element size"); -// assert(data_size(mem2) == element_size, "all nodes in group must have the same element size"); -// assert(p1->offset_in_bytes() <= p2->offset_in_bytes(), "must be sorted by offset"); -// if (p1->offset_in_bytes() + element_size > p2->offset_in_bytes()) { continue; } -// if (p1->offset_in_bytes() + element_size < p2->offset_in_bytes()) { break; } -// -// // Only allow nodes from same origin idx to be packed (see CompileCommand Option Vectorize) -// if (_do_vector_loop && !same_origin_idx(mem1, mem2)) { continue; } -// -// if (!can_pack_into_pair(mem1, mem2)) { continue; } -// -//#ifndef PRODUCT -// if (is_trace_superword_adjacent_memops()) { -// if (found) { -// tty->print_cr(" WARNING: multiple pairs with the same node. 
Ignored pairing:"); -// } else { -// tty->print_cr(" pair:"); -// } -// tty->print(" "); -// p1->print(); -// tty->print(" "); -// p2->print(); -// } -//#endif -// -// if (!found) { -// _pairset.add_pair(mem1, mem2); -// } -// } -// } + // For each ref in group: find others that can be paired: + for (int i = group_start; i < group_end; i++) { + const XPointer& p1 = memops.at(i).xpointer(); + MemNode* mem1 = memops.at(i).mem(); + + bool found = false; + // For each ref in group with larger or equal offset: + for (int j = i + 1; j < group_end; j++) { + const XPointer& p2 = memops.at(j).xpointer(); + MemNode* mem2 = memops.at(j).mem(); + assert(mem1 != mem2, "look only at pair of different memops"); + + // Check for correct distance. + assert(data_size(mem1) == element_size, "all nodes in group must have the same element size"); + assert(data_size(mem2) == element_size, "all nodes in group must have the same element size"); + assert(p1.con_value() <= p2.con_value(), "must be sorted by offset"); + if (p1.con_value() + element_size > p2.con_value()) { continue; } + if (p1.con_value() + element_size < p2.con_value()) { break; } + + // Only allow nodes from same origin idx to be packed (see CompileCommand Option Vectorize) + if (_do_vector_loop && !same_origin_idx(mem1, mem2)) { continue; } + + if (!can_pack_into_pair(mem1, mem2)) { continue; } + +#ifndef PRODUCT + if (is_trace_superword_adjacent_memops()) { + if (found) { + tty->print_cr(" WARNING: multiple pairs with the same node. 
Ignored pairing:"); + } else { + tty->print_cr(" pair:"); + } + tty->print(" "); + p1.print_on(tty); + tty->print(" "); + p2.print_on(tty); + } +#endif + + if (!found) { + _pairset.add_pair(mem1, mem2); + } + } + } } void VLoopMemorySlices::find_memory_slices() { diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 586fc18eba8b9..321f89ea5df70 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -568,15 +568,15 @@ class SuperWord : public ResourceObj { // Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization. class MemOp : public StackObj { private: - const MemNode* _mem; + MemNode* _mem; const XPointer* _xpointer; public: // Empty, for GrowableArray MemOp() : _mem(nullptr), _xpointer(nullptr) {} - MemOp(const MemNode* mem, const XPointer* xpointer) : _mem(mem), _xpointer(xpointer) {} + MemOp(MemNode* mem, const XPointer* xpointer) : _mem(mem), _xpointer(xpointer) {} - const MemNode* mem() const { return _mem; } + MemNode* mem() const { return _mem; } const XPointer& xpointer() const { return *_xpointer; } static int cmp_by_group(MemOp* a, MemOp* b); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index d20c22bc26a23..c69c1f780c964 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -717,6 +717,7 @@ class XPointer : public ArenaObj { bool is_valid() const { return _is_valid; } const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } jint size() const { return _size; } + jint con_value() const { return _decomposed_form.con().value(); } // Aliasing bool never_overlaps_with(const XPointer& other, const VLoop& vloop) const; From 6084b76f7e003065dd1a4e4bce6ad0a523c25277 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 8 Nov 2024 15:10:37 +0100 Subject: [PATCH 019/130] XPointer::is_adjacent_to_and_before --- 
src/hotspot/share/opto/superword.cpp | 10 +++------- src/hotspot/share/opto/vectorization.cpp | 18 ++++++++++++++++++ src/hotspot/share/opto/vectorization.hpp | 2 ++ 3 files changed, 23 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 0a1329d3f0587..18668e78e6161 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -778,13 +778,9 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) const { return false; } - // Adjacent memory references must have the same base, be comparable - // and have the correct distance between them. - const VPointer& p1 = vpointer(s1->as_Mem()); - const VPointer& p2 = vpointer(s2->as_Mem()); - if (p1.base() != p2.base() || !p1.comparable(p2)) return false; - int diff = p2.offset_in_bytes() - p1.offset_in_bytes(); - return diff == data_size(s1); + const XPointer& p1 = xpointer(s1->as_Mem()); + const XPointer& p2 = xpointer(s2->as_Mem()); + return p1.is_adjacent_to_and_before(p2, _vloop); } //------------------------------isomorphic--------------------------- diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 962a60c960ed3..2b972d9d0d7f4 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -485,6 +485,24 @@ int VPointer::cmp_for_sort(const VPointer** p1, const VPointer** p2) { return 0; // equal } +bool XPointer::is_adjacent_to_and_before(const XPointer& other, const VLoop& vloop) const { + const MemPointerDecomposedForm& s1 = decomposed_form(); + const MemPointerDecomposedForm& s2 = other.decomposed_form(); + const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA vloop.mptrace() )); + const bool is_adjacent = aliasing.is_always_at_distance(_size); + +#ifndef PRODUCT + if (vloop.mptrace().is_trace_adjacency()) { + tty->print("Adjacent: %s, because size = %d and aliasing = ", + is_adjacent ? 
"true" : "false", _size); + aliasing.print_on(tty); + tty->cr(); + } +#endif + + return is_adjacent; +} + bool XPointer::never_overlaps_with(const XPointer& other, const VLoop& vloop) const { const MemPointerDecomposedForm& s1 = decomposed_form(); const MemPointerDecomposedForm& s2 = other.decomposed_form(); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index c69c1f780c964..e4a260a3aea8e 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -720,6 +720,8 @@ class XPointer : public ArenaObj { jint con_value() const { return _decomposed_form.con().value(); } // Aliasing + // TODO refactor together with MemPointer - should be shared code. Maybe the _size needs to be in ...Form? + bool is_adjacent_to_and_before(const XPointer& other, const VLoop& vloop) const; bool never_overlaps_with(const XPointer& other, const VLoop& vloop) const; NOT_PRODUCT( void print_on(outputStream* st) const; ) From 87cb5e18e4a1cf02131b95d140a0f1f1f4ff2dce Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 8 Nov 2024 16:10:15 +0100 Subject: [PATCH 020/130] start parsing apart MemPointerDecomposedForm for XPointer --- src/hotspot/share/opto/superword.cpp | 20 ++++++++ src/hotspot/share/opto/vectorization.hpp | 58 ++++++++++++++++++++++-- src/hotspot/share/opto/vtransform.hpp | 3 ++ 3 files changed, 78 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 18668e78e6161..350c910ce6620 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2710,6 +2710,26 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { Node* orig_limit = pre_opaq->original_loop_limit(); assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, ""); + // TODO start + + const XPointer& p = xpointer(align_to_ref); + assert(p.is_valid(), "sanity"); + + // For the main-loop, we want 
the address of align_to_ref to be memory aligned + // with some alignment width (aw, a power of 2). When we enter the main-loop, + // we know that: + // iv = iv_main_loop_enter = limit_pre_loop + // + // We want to adjust the pre-loop limit by executing some adjust_pre_iter many + // extra iterations, and with that acheive alignment of the address. + // + // The adress has been decomposed by VPointer: + // + // pointer = base + SUM(invar_summands) + iv_scale * iv + con + // + + // TODO end + const VPointer& align_to_ref_p = vpointer(align_to_ref); assert(align_to_ref_p.valid(), "sanity"); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index e4a260a3aea8e..5c86944c72b86 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -681,26 +681,54 @@ class VLoopAnalyzer : StackObj { VStatus setup_submodules_helper(); }; -// TODO +// XPointer adapts the MemPointerDecomposedForm to the use in a loop: +// +// pointer = SUM(summands) + con +// +// We define invar_summands as all summands, except those where the variable is +// the base of the memory object or the loop iv. We can thus write: +// +// pointer = base + SUM(invar_summands) + iv_scale * iv + con +// +// We check that all variables in invar_summands are pre-loop invariant. This is +// important when we need to memory align a pointer using the pre-loop limit. +// For heap objects the base is the memory object base, and for off-heap/native +// memory we set base to nullptr. If we find a summand where the variable is the +// iv, we set iv_scale to the corresponding scale. If there is no such summand, +// then we know that the pointer does not depend on the iv, since otherwise there +// would have to be a summand where its variable it main-loop variant. 
+// class XPointer : public ArenaObj { private: typedef MemPointerDecomposedFormParser::Callback Callback; const MemPointerDecomposedForm _decomposed_form; const jint _size; - const bool _is_valid; + + // Derived, for quicker use. + Node* _base; + const jint _iv_scale; + const jint _con_value; + + const bool _is_valid; // TODO any accessor should assert if not valid! public: // Default constructor, e.g. for GrowableArray. XPointer() : _decomposed_form(), _size(0), + _base(nullptr), + _iv_scale(0), + _con_value(0), _is_valid(false) {} template XPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : _decomposed_form(init_decomposed_form(mem, adr_node_callback)), _size(mem->memory_size()), + _base(init_base(_decomposed_form)), + _iv_scale(init_iv_scale(_decomposed_form, vloop)), + _con_value(init_con_value(_decomposed_form)), _is_valid(init_is_valid(_decomposed_form, vloop)) { #ifndef PRODUCT @@ -717,7 +745,10 @@ class XPointer : public ArenaObj { bool is_valid() const { return _is_valid; } const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } jint size() const { return _size; } - jint con_value() const { return _decomposed_form.con().value(); } + Node* base() const { return _base; } + jint iv_scale() const { return _iv_scale; } + jint con_value() const { return _con_value; } + // TODO for each in invar_summands - maybe make it static so we can use it during init? // Aliasing // TODO refactor together with MemPointer - should be shared code. Maybe the _size needs to be in ...Form? 
@@ -734,6 +765,27 @@ class XPointer : public ArenaObj { return parser.decomposed_form(); } + static Node* init_base(const MemPointerDecomposedForm& decomposed_form) { + if (!decomposed_form.base().is_known()) { return nullptr; } + return decomposed_form.base().get(); + } + + static jint init_iv_scale(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { + for (uint i = 0; i < MemPointerDecomposedForm::SUMMANDS_SIZE; i++) { + const MemPointerSummand& summand = decomposed_form.summands_at(i); + Node* variable = summand.variable(); + if (variable == vloop.iv()) { + return summand.scale().value(); + } + } + // No summand with variable == iv. + return 0; + } + + static jint init_con_value(const MemPointerDecomposedForm& decomposed_form) { + return decomposed_form.con().value(); // TODO can this fail - else simplify. + } + // Check that all variables are either the iv, or else invariants. // TODO why pre-loop static bool init_is_valid(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index ee298e7fe723f..029ac11959f8d 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -235,6 +235,9 @@ class VTransform : public StackObj { const VPointer& vpointer(const MemNode* mem) const { return _vloop_analyzer.vpointers().vpointer(mem); } + const XPointer& xpointer(const MemNode* mem) const { + return _vloop_analyzer.vpointers().xpointer(mem); + } // Ensure that the main loop vectors are aligned by adjusting the pre loop limit. 
void determine_mem_ref_and_aw_for_main_loop_alignment(); From 542f1c255e7318ce950779606e635f21697068ad Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 11:22:12 +0100 Subject: [PATCH 021/130] fix native parsing --- src/hotspot/share/opto/mempointer.cpp | 10 +++++++++- src/hotspot/share/opto/mempointer.hpp | 5 ++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 9430fe5b34cd5..ad8096995f720 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -304,8 +304,16 @@ bool MemPointerDecomposedFormParser::is_safe_to_decompose_op(const int opc, cons } MemPointerDecomposedForm::Base MemPointerDecomposedForm::Base::from_AddP(Node* pointer) { + // Bad form -> unknown. AddPNode* adr = pointer->isa_AddP(); - return (adr == nullptr) ? Base() : Base(true, adr->in(AddPNode::Base)); + if (adr == nullptr) { return Base(); } + + // Top base -> native. + Node* base_adr = adr->in(AddPNode::Base); + if (base_adr->is_top()) { return Base(); } + + // Known object base. + return Base(true, adr->in(AddPNode::Base)); } // Compute the aliasing between two MemPointerDecomposedForm. 
We use the "MemPointer Lemma" to diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 319c5112b04c0..b7369bcf62f65 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -521,7 +521,10 @@ class MemPointerDecomposedForm : public StackObj { bool _is_known; Node* _base; - Base(bool is_known, Node* base) : _is_known(is_known), _base(base) {} + Base(bool is_known, Node* base) : _is_known(is_known), _base(base) { + assert(is_known || base == nullptr, "base is null if not known"); + assert(!is_known || (base != nullptr && !base->is_top()), "valid known base"); + } public: Base() : Base(false, nullptr) {} From 7c5d9b07ff785e4a5f4f7dd65cdfcd0eeb3aecd9 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 14:50:46 +0100 Subject: [PATCH 022/130] parse native base --- src/hotspot/share/opto/mempointer.cpp | 48 +++++++++++---- src/hotspot/share/opto/mempointer.hpp | 74 ++++++++++++------------ src/hotspot/share/opto/noOverflowInt.hpp | 1 + src/hotspot/share/opto/superword.cpp | 1 + src/hotspot/share/opto/vectorization.hpp | 9 --- 5 files changed, 78 insertions(+), 55 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index ad8096995f720..240f7b2b10344 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -163,7 +163,6 @@ void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSumman } case Op_CastII: case Op_CastLL: - case Op_CastX2P: case Op_ConvI2L: // On 32bit systems we can also look through ConvL2I, since the final result will always // be truncated back with ConvL2I. On 64bit systems we cannot decompose ConvL2I because @@ -180,6 +179,10 @@ void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSumman adr_node_callback.callback(n); return; } + case Op_CastX2P: + // In theory, we could parse through this, and further decompose. 
But this is also a good + // candidate for a native-memory "base". + break; default: // All other operations cannot be further decomposed. We just add them to the // terminal summands below. @@ -303,17 +306,42 @@ bool MemPointerDecomposedFormParser::is_safe_to_decompose_op(const int opc, cons #endif } -MemPointerDecomposedForm::Base MemPointerDecomposedForm::Base::from_AddP(Node* pointer) { +MemPointerDecomposedForm::Base MemPointerDecomposedForm::Base::make(Node* pointer, const GrowableArray& summands) { // Bad form -> unknown. AddPNode* adr = pointer->isa_AddP(); if (adr == nullptr) { return Base(); } - // Top base -> native. - Node* base_adr = adr->in(AddPNode::Base); - if (base_adr->is_top()) { return Base(); } + // Non-TOP base -> object. + Node* maybe_object_base = adr->in(AddPNode::Base); + bool is_object_base = !maybe_object_base->is_top(); - // Known object base. - return Base(true, adr->in(AddPNode::Base)); + Node* base = find_base(is_object_base ? maybe_object_base : nullptr, summands); + + if (base == nullptr) { + // Not found -> unknown. + return Base(); + } else if (is_object_base) { + assert(base == maybe_object_base, "we confirmed that it is in summands"); + return Base(Object, base); + } else { + return Base(Native, base); + } +} + +Node* MemPointerDecomposedForm::Base::find_base(Node* object_base, const GrowableArray& summands) { + for (int i = 0; i < summands.length(); i++) { + const MemPointerSummand& s = summands.at(i); + assert(s.variable() != nullptr, "no empty summands"); + // Object base. + if (object_base != nullptr && s.variable() == object_base && s.scale().is_one()) { + return object_base; + } + // Native base. + if (object_base == nullptr && s.variable()->Opcode() == Op_CastX2P && s.scale().is_one()) { + return s.variable(); + } + } + return nullptr; } // Compute the aliasing between two MemPointerDecomposedForm. 
We use the "MemPointer Lemma" to @@ -411,14 +439,14 @@ bool MemPointerDecomposedForm::has_same_summands_as(const MemPointerDecomposedFo bool MemPointerDecomposedForm::has_different_base_but_otherwise_same_summands_as(const MemPointerDecomposedForm& other) const { if (!base().is_object() || !other.base().is_object() || - base().get() == other.base().get()) { + base().object() == other.base().object()) { // The base is the same, or we do not know if the base is different. return false; } #ifdef ASSERT - const MemPointerSummand base1(base().get(), NoOverflowInt(1)); - const MemPointerSummand base2(other.base().get(), NoOverflowInt(1)); + const MemPointerSummand base1(base().object(), NoOverflowInt(1)); + const MemPointerSummand base2(other.base().object(), NoOverflowInt(1)); assert(summands_at(0) == base1 && other.summands_at(0) == base2, "bases in 0th element"); #endif diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index b7369bcf62f65..64adfe065bf8d 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -518,35 +518,44 @@ class MemPointerDecomposedForm : public StackObj { class Base : public StackObj { private: - bool _is_known; + enum Kind { Unknown, Object, Native }; + Kind _kind; Node* _base; - Base(bool is_known, Node* base) : _is_known(is_known), _base(base) { - assert(is_known || base == nullptr, "base is null if not known"); - assert(!is_known || (base != nullptr && !base->is_top()), "valid known base"); + Base(Kind kind, Node* base) : _kind(kind), _base(base) { + assert((kind == Unknown) == (base == nullptr), "known base"); } public: - Base() : Base(false, nullptr) {} + Base() : Base(Unknown, nullptr) {} + static Base make(Node* pointer, const GrowableArray& summands); - static Base from_AddP(Node* pointer); - bool is_known() const { return _is_known; } - Node* get() const { assert(is_known(), "must be"); return _base; } - bool is_object() const { return _is_known && _base 
!= nullptr; } + bool is_known() const { return _kind != Unknown; } + bool is_object() const { return _kind == Object; } + bool is_native() const { return _kind == Native; } + Node* object() const { assert(is_object(),""); return _base; } + Node* native() const { assert(is_native(),""); return _base; } + Node* object_or_native_or_null() const { return _base; } #ifndef PRODUCT void print_on(outputStream* st) const { - if (_is_known) { - if (_base == nullptr) { - tty->print("native"); - } else { + switch (_kind) { + case Object: + tty->print("native "); tty->print("%d %s", _base->_idx, _base->Name()); - } - } else { - tty->print("unknown"); - } + break; + case Native: + tty->print("native "); + tty->print("%d %s", _base->_idx, _base->Name()); + break; + default: + tty->print("native"); + }; } #endif + + private: + static Node* find_base(Node* object_base, const GrowableArray& summands); }; private: @@ -570,7 +579,7 @@ class MemPointerDecomposedForm : public StackObj { MemPointerDecomposedForm(Node* pointer, const GrowableArray& summands, const NoOverflowInt& con) : _con(con), - _base(Base::from_AddP(pointer)) + _base(Base::make(pointer, summands)) { assert(!_con.is_NaN(), "non-NaN constant"); assert(summands.length() <= SUMMANDS_SIZE, "summands must fit"); @@ -582,28 +591,21 @@ class MemPointerDecomposedForm : public StackObj { } #endif - if (_base.is_object()) { - MemPointerSummand b(_base.get(), NoOverflowInt(1)); - if (summands.contains(b)) { - // We have a known base object, move it to the 0th summand. - _summands[0] = b; - int pos = 1; - for (int i = 0; i < summands.length(); i++) { - if (summands.at(i) == b) { continue; } - _summands[pos++] = summands.at(i); - } - return; - } else { - // We did not find the base object, reset to unknown base. - assert(false, "we should always find the base"); - _base = Base(); - } + // Put the base in in the 0th summand. 
+ Node* base = _base.object_or_native_or_null(); + int pos = 0; + if (base != nullptr) { + MemPointerSummand b(base, NoOverflowInt(1)); + _summands[0] = b; + pos++; } - + // Put all other summands afterward. for (int i = 0; i < summands.length(); i++) { const MemPointerSummand& s = summands.at(i); - _summands[i] = s; + if (s.variable() == base && s.scale().is_one()) { continue; } + _summands[pos++] = summands.at(i); } + assert(pos == summands.length(), "copied all summands"); } public: diff --git a/src/hotspot/share/opto/noOverflowInt.hpp b/src/hotspot/share/opto/noOverflowInt.hpp index 302375fdbde5a..6e14e446e7bec 100644 --- a/src/hotspot/share/opto/noOverflowInt.hpp +++ b/src/hotspot/share/opto/noOverflowInt.hpp @@ -52,6 +52,7 @@ class NoOverflowInt { bool is_NaN() const { return _is_NaN; } jint value() const { assert(!is_NaN(), "NaN not allowed"); return _value; } bool is_zero() const { return !is_NaN() && value() == 0; } + bool is_one() const { return !is_NaN() && value() == 1; } friend NoOverflowInt operator+(const NoOverflowInt& a, const NoOverflowInt& b) { if (a.is_NaN()) { return a; } diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 51a8d1e61f8ab..4ddcf07420de6 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2715,6 +2715,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { const XPointer& p = xpointer(align_to_ref); assert(p.is_valid(), "sanity"); + p.print_on(tty); // For the main-loop, we want the address of align_to_ref to be memory aligned // with some alignment width (aw, a power of 2). 
When we enter the main-loop, diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 5898d41905f6f..ce7f518e04009 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -706,7 +706,6 @@ class XPointer : public ArenaObj { const jint _size; // Derived, for quicker use. - Node* _base; const jint _iv_scale; const jint _con_value; @@ -717,7 +716,6 @@ class XPointer : public ArenaObj { XPointer() : _decomposed_form(), _size(0), - _base(nullptr), _iv_scale(0), _con_value(0), _is_valid(false) {} @@ -726,7 +724,6 @@ class XPointer : public ArenaObj { XPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : _decomposed_form(init_decomposed_form(mem, adr_node_callback)), _size(mem->memory_size()), - _base(init_base(_decomposed_form)), _iv_scale(init_iv_scale(_decomposed_form, vloop)), _con_value(init_con_value(_decomposed_form)), _is_valid(init_is_valid(_decomposed_form, vloop)) @@ -745,7 +742,6 @@ class XPointer : public ArenaObj { bool is_valid() const { return _is_valid; } const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } jint size() const { return _size; } - Node* base() const { return _base; } jint iv_scale() const { return _iv_scale; } jint con_value() const { return _con_value; } // TODO for each in invar_summands - maybe make it static so we can use it during init? 
@@ -765,11 +761,6 @@ class XPointer : public ArenaObj { return parser.decomposed_form(); } - static Node* init_base(const MemPointerDecomposedForm& decomposed_form) { - if (!decomposed_form.base().is_known()) { return nullptr; } - return decomposed_form.base().get(); - } - static jint init_iv_scale(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { for (uint i = 0; i < MemPointerDecomposedForm::SUMMANDS_SIZE; i++) { const MemPointerSummand& summand = decomposed_form.summands_at(i); From ef413287f09698abe664e66a6eea9fbe77020bbc Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 14:54:32 +0100 Subject: [PATCH 023/130] base kinds --- src/hotspot/share/opto/mempointer.hpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 64adfe065bf8d..0cec0396a83af 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -506,8 +506,9 @@ class MemPointerSummand : public StackObj { // // pointer = SUM(summands) + con // -// Note: if the base is known to be an object reference (base().is_object()), then -// the base is in the 0th summand. +// Note: if the base is known, then it is in the 0th summand. A base can be: +// - on-heap / object: base().object() +// - off-heap / native: base().native() class MemPointerDecomposedForm : public StackObj { public: // We limit the number of summands to 10.
This is just a best guess, and not at this From 2180ab9fe5a74f3fd8f1667305d107f7e83d0451 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 14:58:23 +0100 Subject: [PATCH 024/130] some asserts --- src/hotspot/share/opto/vectorization.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index ce7f518e04009..839247b85c0fa 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -740,10 +740,10 @@ class XPointer : public ArenaObj { // Accessors bool is_valid() const { return _is_valid; } - const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } - jint size() const { return _size; } - jint iv_scale() const { return _iv_scale; } - jint con_value() const { return _con_value; } + const MemPointerDecomposedForm& decomposed_form() const { assert(_is_valid, ""); return _decomposed_form; } + jint size() const { assert(_is_valid, ""); return _size; } + jint iv_scale() const { assert(_is_valid, ""); return _iv_scale; } + jint con_value() const { assert(_is_valid, ""); return _con_value; } // TODO for each in invar_summands - maybe make it static so we can use it during init? 
// Aliasing From d36c0a565d12a9555deec76c0529975f3aeef00a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 15:09:39 +0100 Subject: [PATCH 025/130] con_value -> con and additional description --- src/hotspot/share/opto/superword.cpp | 6 ++--- src/hotspot/share/opto/vectorization.hpp | 29 +++++++++++------------- 2 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 4ddcf07420de6..18235d1a12df3 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -631,9 +631,9 @@ void SuperWord::create_adjacent_memop_pairs_in_one_group(const GrowableArray p2.con_value()) { continue; } - if (p1.con_value() + element_size < p2.con_value()) { break; } + assert(p1.con() <= p2.con(), "must be sorted by offset"); + if (p1.con() + element_size > p2.con()) { continue; } + if (p1.con() + element_size < p2.con()) { break; } // Only allow nodes from same origin idx to be packed (see CompileCommand Option Vectorize) if (_do_vector_loop && !same_origin_idx(mem1, mem2)) { continue; } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 839247b85c0fa..1d33bf06beffe 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -686,17 +686,21 @@ class VLoopAnalyzer : StackObj { // pointer = SUM(summands) + con // // We define invar_summands as all summands, except those where the variable is -// the base of the memory object or the loop iv. We can thus write: +// the base or the loop iv. We can thus write: // // pointer = base + SUM(invar_summands) + iv_scale * iv + con // -// We check that all variables in invar_summands are pre-loop invariant. This is -// important when we need to memory align a pointer using the pre-loop limit. -// For heap objects the base is the memory object base, and for off-heap/native -// memory we set base to nullptr. 
If we find a summand where the variable is the -// iv, we set iv_scale to the corresponding scale. If there is no such summand, -// then we know that the pointer does not depend on the iv, since otherwise there -// would have to be a summand where its variable it main-loop variant. +// We have the following components: +// - base: +// on-heap (object base) or off-heap (native base address) +// - invar_summands: +// pre-loop invariant. This is important when we need to memory align a +// pointer using the pre-loop limit. +// - iv and iv_scale: +// If we find a summand where the variable is the iv, we set iv_scale to the +// corresponding scale. If there is no such summand, then we know that the +// pointer does not depend on the iv, since otherwise there would have to be +// a summand where its variable is main-loop variant. // class XPointer : public ArenaObj { private: @@ -707,7 +711,6 @@ class VLoopAnalyzer : StackObj { // Derived, for quicker use. const jint _iv_scale; - const jint _con_value; const bool _is_valid; // TODO any accessor should assert if not valid!
@@ -717,7 +720,6 @@ class XPointer : public ArenaObj { _decomposed_form(), _size(0), _iv_scale(0), - _con_value(0), _is_valid(false) {} template @@ -725,7 +727,6 @@ class XPointer : public ArenaObj { _decomposed_form(init_decomposed_form(mem, adr_node_callback)), _size(mem->memory_size()), _iv_scale(init_iv_scale(_decomposed_form, vloop)), - _con_value(init_con_value(_decomposed_form)), _is_valid(init_is_valid(_decomposed_form, vloop)) { #ifndef PRODUCT @@ -743,7 +744,7 @@ class XPointer : public ArenaObj { const MemPointerDecomposedForm& decomposed_form() const { assert(_is_valid, ""); return _decomposed_form; } jint size() const { assert(_is_valid, ""); return _size; } jint iv_scale() const { assert(_is_valid, ""); return _iv_scale; } - jint con_value() const { assert(_is_valid, ""); return _con_value; } + jint con() const { return decomposed_form().con().value(); } // TODO for each in invar_summands - maybe make it static so we can use it during init? // Aliasing @@ -773,10 +774,6 @@ class XPointer : public ArenaObj { return 0; } - static jint init_con_value(const MemPointerDecomposedForm& decomposed_form) { - return decomposed_form.con().value(); // TODO can this fail - else simplify. - } - // Check that all variables are either the iv, or else invariants. 
// TODO why pre-loop static bool init_is_valid(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { From 3f9fa3c08b680ba9fc60d4878876be158125ea16 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 15:20:42 +0100 Subject: [PATCH 026/130] more comments --- src/hotspot/share/opto/superword.cpp | 20 +++++--------------- src/hotspot/share/opto/vectorization.hpp | 4 +++- 2 files changed, 8 insertions(+), 16 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 18235d1a12df3..3ace245426a13 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2717,20 +2717,10 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { assert(p.is_valid(), "sanity"); p.print_on(tty); - // For the main-loop, we want the address of align_to_ref to be memory aligned - // with some alignment width (aw, a power of 2). When we enter the main-loop, - // we know that: - // iv = iv_main_loop_enter = limit_pre_loop - // - // We want to adjust the pre-loop limit by executing some adjust_pre_iter many - // extra iterations, and with that acheive alignment of the address. - // - // The adress has been decomposed by VPointer: - // - // pointer = base + SUM(invar_summands) + iv_scale * iv + con - // - - // TODO end + // TODO rename offset -> con + // scale -> iv_scale + // adr = base + offset + invar + scale * iv (1) + // adr = base + invar + iv_scale * iv + con (1) const VPointer& align_to_ref_p = vpointer(align_to_ref); assert(align_to_ref_p.valid(), "sanity"); @@ -2741,7 +2731,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // limit by executing adjust_pre_iter many extra iterations, we can change the // alignment of the address. 
// - // adr = base + offset + invar + scale * iv (1) + // adr = base + invar + iv_scale * iv + con (1) // adr % aw = 0 (2) // // Note, that we are defining the modulo operator "%" such that the remainder is diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 1d33bf06beffe..e3416ff8f414b 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -688,7 +688,9 @@ class VLoopAnalyzer : StackObj { // We define invar_summands as all summands, except those where the variable is // the base or the loop iv. We can thus write: // -// pointer = base + SUM(invar_summands) + iv_scale * iv + con +// pointer = base + invar + iv_scale * iv + con +// +// invar = SUM(invar_summands) // // We have the following components: // - base: From 3b517fa5a3bc4dffe66337db28426d3519aa15e7 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 15:34:54 +0100 Subject: [PATCH 027/130] renamings in adjust_pre_loop_limit_to_align_main_loop_vectors --- src/hotspot/share/opto/superword.cpp | 143 +++++++++++++-------------- 1 file changed, 70 insertions(+), 73 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 3ace245426a13..f2b7864ef888b 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2717,11 +2717,6 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { assert(p.is_valid(), "sanity"); p.print_on(tty); - // TODO rename offset -> con - // scale -> iv_scale - // adr = base + offset + invar + scale * iv (1) - // adr = base + invar + iv_scale * iv + con (1) - const VPointer& align_to_ref_p = vpointer(align_to_ref); assert(align_to_ref_p.valid(), "sanity"); @@ -2751,52 +2746,52 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // iv = new_limit = old_limit + adjust_pre_iter (3a, stride > 0) // iv = new_limit = old_limit - adjust_pre_iter (3b, stride < 
0) // - // We define boi as: + // We define bic as: // - // boi = base + offset + invar (4) + // bic = base + invar + con (4) // // And now we can simplify the address using (1), (3), and (4): // - // adr = boi + scale * new_limit - // adr = boi + scale * (old_limit + adjust_pre_iter) (5a, stride > 0) - // adr = boi + scale * (old_limit - adjust_pre_iter) (5b, stride < 0) + // adr = bic + iv_scale * new_limit + // adr = bic + iv_scale * (old_limit + adjust_pre_iter) (5a, stride > 0) + // adr = bic + iv_scale * (old_limit - adjust_pre_iter) (5b, stride < 0) // // And hence we can restate (2) with (5), and solve the equation for adjust_pre_iter: // - // (boi + scale * (old_limit + adjust_pre_iter) % aw = 0 (6a, stride > 0) - // (boi + scale * (old_limit - adjust_pre_iter) % aw = 0 (6b, stride < 0) + // (bic + iv_scale * (old_limit + adjust_pre_iter) % aw = 0 (6a, stride > 0) + // (bic + iv_scale * (old_limit - adjust_pre_iter) % aw = 0 (6b, stride < 0) // - // In most cases, scale is the element size, for example: + // In most cases, iv_scale is the element size, for example: // // for (i = 0; i < a.length; i++) { a[i] = ...; } // - // It is thus reasonable to assume that both abs(scale) and abs(stride) are + // It is thus reasonable to assume that both abs(iv_scale) and abs(stride) are // strictly positive powers of 2. Further, they can be assumed to be non-zero, // otherwise the address does not depend on iv, and the alignment cannot be // affected by adjusting the pre-loop limit. // - // Further, if abs(scale) >= aw, then adjust_pre_iter has no effect on alignment, and - // we are not able to affect the alignment at all. Hence, we require abs(scale) < aw. + // Further, if abs(iv_scale) >= aw, then adjust_pre_iter has no effect on alignment, and + // we are not able to affect the alignment at all. Hence, we require abs(iv_scale) < aw. // - // Moreover, for alignment to be achievable, boi must be a multiple of scale. 
If strict + // Moreover, for alignment to be achievable, bic must be a multiple of iv_scale. If strict // alignment is required (i.e. -XX:+AlignVector), this is guaranteed by the filtering // done with the AlignmentSolver / AlignmentSolution. If strict alignment is not // required, then alignment is still preferable for performance, but not necessary. - // In many cases boi will be a multiple of scale, but if it is not, then the adjustment + // In many cases bic will be a multiple of iv_scale, but if it is not, then the adjustment // does not guarantee alignment, but the code is still correct. // - // Hence, in what follows we assume that boi is a multiple of scale, and in fact all - // terms in (6) are multiples of scale. Therefore we divide all terms by scale: + // Hence, in what follows we assume that bic is a multiple of iv_scale, and in fact all + // terms in (6) are multiples of iv_scale. Therefore we divide all terms by iv_scale: // - // AW = aw / abs(scale) (power of 2) (7) - // BOI = boi / abs(scale) (8) + // AW = aw / abs(iv_scale) (power of 2) (7) + // BIC = bic / abs(iv_scale) (8) // - // and restate (6), using (7) and (8), i.e. we divide (6) by abs(scale): + // and restate (6), using (7) and (8), i.e. we divide (6) by abs(iv_scale): // - // (BOI + sign(scale) * (old_limit + adjust_pre_iter) % AW = 0 (9a, stride > 0) - // (BOI + sign(scale) * (old_limit - adjust_pre_iter) % AW = 0 (9b, stride < 0) + // (BIC + sign(iv_scale) * (old_limit + adjust_pre_iter) % AW = 0 (9a, stride > 0) + // (BIC + sign(iv_scale) * (old_limit - adjust_pre_iter) % AW = 0 (9b, stride < 0) // - // where: sign(scale) = scale / abs(scale) = (scale > 0 ? 1 : -1) + // where: sign(iv_scale) = iv_scale / abs(iv_scale) = (iv_scale > 0 ? 1 : -1) // // Note, (9) allows for periodic solutions of adjust_pre_iter, with periodicity AW. 
// But we would like to spend as few iterations in the pre-loop as possible, @@ -2806,40 +2801,40 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // // We solve (9) for adjust_pre_iter, in the following 4 cases: // - // Case A: scale > 0 && stride > 0 (i.e. sign(scale) = 1) - // (BOI + old_limit + adjust_pre_iter) % AW = 0 - // adjust_pre_iter = (-BOI - old_limit) % AW (11a) + // Case A: iv_scale > 0 && stride > 0 (i.e. sign(iv_scale) = 1) + // (BIC + old_limit + adjust_pre_iter) % AW = 0 + // adjust_pre_iter = (-BIC - old_limit) % AW (11a) // - // Case B: scale < 0 && stride > 0 (i.e. sign(scale) = -1) - // (BOI - old_limit - adjust_pre_iter) % AW = 0 - // adjust_pre_iter = (BOI - old_limit) % AW (11b) + // Case B: iv_scale < 0 && stride > 0 (i.e. sign(iv_scale) = -1) + // (BIC - old_limit - adjust_pre_iter) % AW = 0 + // adjust_pre_iter = (BIC - old_limit) % AW (11b) // - // Case C: scale > 0 && stride < 0 (i.e. sign(scale) = 1) - // (BOI + old_limit - adjust_pre_iter) % AW = 0 - // adjust_pre_iter = (BOI + old_limit) % AW (11c) + // Case C: iv_scale > 0 && stride < 0 (i.e. sign(iv_scale) = 1) + // (BIC + old_limit - adjust_pre_iter) % AW = 0 + // adjust_pre_iter = (BIC + old_limit) % AW (11c) // - // Case D: scale < 0 && stride < 0 (i.e. sign(scale) = -1) - // (BOI - old_limit + adjust_pre_iter) % AW = 0 - // adjust_pre_iter = (-BOI + old_limit) % AW (11d) + // Case D: iv_scale < 0 && stride < 0 (i.e. sign(iv_scale) = -1) + // (BIC - old_limit + adjust_pre_iter) % AW = 0 + // adjust_pre_iter = (-BIC + old_limit) % AW (11d) // // We now generalize the equations (11*) by using: // - // OP: (stride > 0) ? SUB : ADD - // XBOI: (stride * scale > 0) ? -BOI : BOI + // OP: (stride > 0) ? SUB : ADD + // XBIC: (stride * iv_scale > 0) ? 
-BIC : BIC // // which gives us the final pre-loop limit adjustment: // - // adjust_pre_iter = (XBOI OP old_limit) % AW (12) + // adjust_pre_iter = (XBIC OP old_limit) % AW (12) // - // We can construct XBOI by additionally defining: + // We can construct XBIC by additionally defining: // - // xboi = (stride * scale > 0) ? -boi : boi (13) + // xbic = (stride * iv_scale > 0) ? -bic : bic (13) // // which gives us: // - // XBOI = (stride * scale > 0) ? -BOI : BOI - // = (stride * scale > 0) ? -boi / abs(scale) : boi / abs(scale) - // = xboi / abs(scale) (14) + // XBIC = (stride * iv_scale > 0) ? -BIC : BIC + // = (stride * iv_scale > 0) ? -bic / abs(iv_scale) : bic / abs(iv_scale) + // = xbic / abs(iv_scale) (14) // // When we have computed adjust_pre_iter, we update the pre-loop limit // with (3a, b). However, we have to make sure that the adjust_pre_iter @@ -2852,6 +2847,8 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // constrained_limit = MAX(old_limit - adjust_pre_iter, orig_limit) // = MAX(new_limit, orig_limit) (15a, stride < 0) // + // TODO rename scale -> iv_scale + // offset -> con const int stride = iv_stride(); const int scale = align_to_ref_p.scale_in_bytes(); const int offset = align_to_ref_p.offset_in_bytes(); @@ -2908,13 +2905,13 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { #endif // 1: Compute (13a, b): - // xboi = -boi = (-base - offset - invar) (stride * scale > 0) - // xboi = +boi = (+base + offset + invar) (stride * scale < 0) + // xbic = -bic = (-base - offset - invar) (stride * scale > 0) + // xbic = +bic = (+base + offset + invar) (stride * scale < 0) const bool is_sub = scale * stride > 0; // 1.1: offset - Node* xboi = igvn().intcon(is_sub ? -offset : offset); - TRACE_ALIGN_VECTOR_NODE(xboi); + Node* xbic = igvn().intcon(is_sub ? 
-offset : offset); + TRACE_ALIGN_VECTOR_NODE(xbic); // 1.2: invar (if it exists) if (invar != nullptr) { @@ -2927,12 +2924,12 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { TRACE_ALIGN_VECTOR_NODE(invar); } if (is_sub) { - xboi = new SubINode(xboi, invar); + xbic = new SubINode(xbic, invar); } else { - xboi = new AddINode(xboi, invar); + xbic = new AddINode(xbic, invar); } - phase()->register_new_node(xboi, pre_ctrl); - TRACE_ALIGN_VECTOR_NODE(xboi); + phase()->register_new_node(xbic, pre_ctrl); + TRACE_ALIGN_VECTOR_NODE(xbic); } // 1.3: base (unless base is guaranteed aw aligned) @@ -2949,44 +2946,44 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { TRACE_ALIGN_VECTOR_NODE(xbase); #endif if (is_sub) { - xboi = new SubINode(xboi, xbase); + xbic = new SubINode(xbic, xbase); } else { - xboi = new AddINode(xboi, xbase); + xbic = new AddINode(xbic, xbase); } - phase()->register_new_node(xboi, pre_ctrl); - TRACE_ALIGN_VECTOR_NODE(xboi); + phase()->register_new_node(xbic, pre_ctrl); + TRACE_ALIGN_VECTOR_NODE(xbic); } // 2: Compute (14): - // XBOI = xboi / abs(scale) + // XBIC = xbic / abs(scale) // The division is executed as shift Node* log2_abs_scale = igvn().intcon(exact_log2(abs(scale))); - Node* XBOI = new URShiftINode(xboi, log2_abs_scale); - phase()->register_new_node(XBOI, pre_ctrl); + Node* XBIC = new URShiftINode(xbic, log2_abs_scale); + phase()->register_new_node(XBIC, pre_ctrl); TRACE_ALIGN_VECTOR_NODE(log2_abs_scale); - TRACE_ALIGN_VECTOR_NODE(XBOI); + TRACE_ALIGN_VECTOR_NODE(XBIC); // 3: Compute (12): - // adjust_pre_iter = (XBOI OP old_limit) % AW + // adjust_pre_iter = (XBIC OP old_limit) % AW // - // 3.1: XBOI_OP_old_limit = XBOI OP old_limit - Node* XBOI_OP_old_limit = nullptr; + // 3.1: XBIC_OP_old_limit = XBIC OP old_limit + Node* XBIC_OP_old_limit = nullptr; if (stride > 0) { - XBOI_OP_old_limit = new SubINode(XBOI, old_limit); + XBIC_OP_old_limit = new SubINode(XBIC, old_limit); } else { - 
XBOI_OP_old_limit = new AddINode(XBOI, old_limit); + XBIC_OP_old_limit = new AddINode(XBIC, old_limit); } - phase()->register_new_node(XBOI_OP_old_limit, pre_ctrl); - TRACE_ALIGN_VECTOR_NODE(XBOI_OP_old_limit); + phase()->register_new_node(XBIC_OP_old_limit, pre_ctrl); + TRACE_ALIGN_VECTOR_NODE(XBIC_OP_old_limit); // 3.2: Compute: - // adjust_pre_iter = (XBOI OP old_limit) % AW - // = XBOI_OP_old_limit % AW - // = XBOI_OP_old_limit AND (AW - 1) + // adjust_pre_iter = (XBIC OP old_limit) % AW + // = XBIC_OP_old_limit % AW + // = XBIC_OP_old_limit AND (AW - 1) // Since AW is a power of 2, the modulo operation can be replaced with // a bitmask operation. Node* mask_AW = igvn().intcon(AW-1); - Node* adjust_pre_iter = new AndINode(XBOI_OP_old_limit, mask_AW); + Node* adjust_pre_iter = new AndINode(XBIC_OP_old_limit, mask_AW); phase()->register_new_node(adjust_pre_iter, pre_ctrl); TRACE_ALIGN_VECTOR_NODE(mask_AW); TRACE_ALIGN_VECTOR_NODE(adjust_pre_iter); From 62b6ec0916cbdf73daa1a5c1c4eb8e8010da49f2 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 15:39:26 +0100 Subject: [PATCH 028/130] rename offset -> con --- src/hotspot/share/opto/superword.cpp | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index f2b7864ef888b..36c2628acae58 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2848,10 +2848,9 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // = MAX(new_limit, orig_limit) (15a, stride < 0) // // TODO rename scale -> iv_scale - const int stride = iv_stride(); const int scale = align_to_ref_p.scale_in_bytes(); - const int offset = align_to_ref_p.offset_in_bytes(); + const int con = align_to_ref_p.offset_in_bytes(); Node* base = align_to_ref_p.adr(); Node* invar = align_to_ref_p.invar(); @@ -2860,10 +2859,12 @@ void
VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { tty->print_cr("\nVTransform::adjust_pre_loop_limit_to_align_main_loop_vectors:"); tty->print(" align_to_ref:"); align_to_ref->dump(); + tty->print(" "); + p.print_on(tty); tty->print_cr(" aw: %d", aw); tty->print_cr(" stride: %d", stride); tty->print_cr(" scale: %d", scale); - tty->print_cr(" offset: %d", offset); + tty->print_cr(" con: %d", con); tty->print(" base:"); base->dump(); if (invar == nullptr) { @@ -2905,12 +2906,12 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { #endif // 1: Compute (13a, b): - // xbic = -bic = (-base - offset - invar) (stride * scale > 0) - // xbic = +bic = (+base + offset + invar) (stride * scale < 0) + // xbic = -bic = (-base - invar - con) (stride * scale > 0) + // xbic = +bic = (+base + invar + con) (stride * scale < 0) const bool is_sub = scale * stride > 0; - // 1.1: offset - Node* xbic = igvn().intcon(is_sub ? -offset : offset); + // 1.1: con + Node* xbic = igvn().intcon(is_sub ? 
-con : con); TRACE_ALIGN_VECTOR_NODE(xbic); // 1.2: invar (if it exists) From b548aa4681255546c01d7e3a67b7e18f2933b3d7 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 15:43:34 +0100 Subject: [PATCH 029/130] rename scale -> iv_scale --- src/hotspot/share/opto/superword.cpp | 36 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 36c2628acae58..7304507bd643e 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2849,7 +2849,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // // TODO rename scale -> iv_scale const int stride = iv_stride(); - const int scale = align_to_ref_p.scale_in_bytes(); + const int iv_scale = align_to_ref_p.scale_in_bytes(); const int con = align_to_ref_p.offset_in_bytes(); Node* base = align_to_ref_p.adr(); Node* invar = align_to_ref_p.invar(); @@ -2863,7 +2863,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { p.print_on(tty); tty->print_cr(" aw: %d", aw); tty->print_cr(" stride: %d", stride); - tty->print_cr(" scale: %d", scale); + tty->print_cr(" iv_scale: %d", iv_scale); tty->print_cr(" con: %d", con); tty->print(" base:"); base->dump(); @@ -2880,35 +2880,35 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { } #endif - if (stride == 0 || !is_power_of_2(abs(stride)) || - scale == 0 || !is_power_of_2(abs(scale)) || - abs(scale) >= aw) { + if (stride == 0 || !is_power_of_2(abs(stride)) || + iv_scale == 0 || !is_power_of_2(abs(iv_scale)) || // TODO abs ok? 
+ abs(iv_scale) >= aw) { #ifdef ASSERT if (_trace._align_vector) { tty->print_cr(" Alignment cannot be affected by changing pre-loop limit because"); - tty->print_cr(" stride or scale are not power of 2, or abs(scale) >= aw."); + tty->print_cr(" stride or iv_scale are not power of 2, or abs(iv_scale) >= aw."); } #endif // Cannot affect alignment, abort. return; } - assert(stride != 0 && is_power_of_2(abs(stride)) && - scale != 0 && is_power_of_2(abs(scale)) && - abs(scale) < aw, "otherwise we cannot affect alignment with pre-loop"); + assert(stride != 0 && is_power_of_2(abs(stride)) && + iv_scale != 0 && is_power_of_2(abs(iv_scale)) && + abs(iv_scale) < aw, "otherwise we cannot affect alignment with pre-loop"); - const int AW = aw / abs(scale); + const int AW = aw / abs(iv_scale); #ifdef ASSERT if (_trace._align_vector) { - tty->print_cr(" AW = aw(%d) / abs(scale(%d)) = %d", aw, scale, AW); + tty->print_cr(" AW = aw(%d) / abs(iv_scale(%d)) = %d", aw, iv_scale, AW); } #endif // 1: Compute (13a, b): - // xbic = -bic = (-base - invar - con) (stride * scale > 0) - // xbic = +bic = (+base + invar + con) (stride * scale < 0) - const bool is_sub = scale * stride > 0; + // xbic = -bic = (-base - invar - con) (stride * iv_scale > 0) + // xbic = +bic = (+base + invar + con) (stride * iv_scale < 0) + const bool is_sub = iv_scale * stride > 0; // 1.1: con Node* xbic = igvn().intcon(is_sub ? 
-con : con); @@ -2956,12 +2956,12 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { } // 2: Compute (14): - // XBIC = xbic / abs(scale) + // XBIC = xbic / abs(iv_scale) // The division is executed as shift - Node* log2_abs_scale = igvn().intcon(exact_log2(abs(scale))); - Node* XBIC = new URShiftINode(xbic, log2_abs_scale); + Node* log2_abs_iv_scale = igvn().intcon(exact_log2(abs(iv_scale))); + Node* XBIC = new URShiftINode(xbic, log2_abs_iv_scale); phase()->register_new_node(XBIC, pre_ctrl); - TRACE_ALIGN_VECTOR_NODE(log2_abs_scale); + TRACE_ALIGN_VECTOR_NODE(log2_abs_iv_scale); TRACE_ALIGN_VECTOR_NODE(XBIC); // 3: Compute (12): From 5fdd95810eceaf610f21f22441e6f5ca0f77cc52 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 15:53:46 +0100 Subject: [PATCH 030/130] use XPointer in adjust_pre_loop_limit_to_align_main_loop_vectors, WIP --- src/hotspot/share/opto/mempointer.hpp | 11 ++++++----- src/hotspot/share/opto/superword.cpp | 23 +++++++++-------------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 0cec0396a83af..8c7a362e3ac1e 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -531,11 +531,12 @@ class MemPointerDecomposedForm : public StackObj { Base() : Base(Unknown, nullptr) {} static Base make(Node* pointer, const GrowableArray& summands); - bool is_known() const { return _kind != Unknown; } - bool is_object() const { return _kind == Object; } - bool is_native() const { return _kind == Native; } - Node* object() const { assert(is_object(),""); return _base; } - Node* native() const { assert(is_native(),""); return _base; } + bool is_known() const { return _kind != Unknown; } + bool is_object() const { return _kind == Object; } + bool is_native() const { return _kind == Native; } + Node* object() const { assert(is_object(), ""); return _base; } + Node* native() const { 
assert(is_native(), ""); return _base; } + Node* object_or_native() const { assert(is_known(), ""); return _base; } Node* object_or_native_or_null() const { return _base; } #ifndef PRODUCT diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 7304507bd643e..b46dd92ebf589 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2711,15 +2711,10 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { Node* orig_limit = pre_opaq->original_loop_limit(); assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, ""); - // TODO start - const XPointer& p = xpointer(align_to_ref); assert(p.is_valid(), "sanity"); - p.print_on(tty); - - const VPointer& align_to_ref_p = vpointer(align_to_ref); - assert(align_to_ref_p.valid(), "sanity"); + // TODO rename stride -> iv_stride // For the main-loop, we want the address of align_to_ref to be memory aligned // with some alignment width (aw, a power of 2). When we enter the main-loop, // we know that iv is equal to the pre-loop limit. If we adjust the pre-loop @@ -2819,7 +2814,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // // We now generalize the equations (11*) by using: // - // OP: (stride > 0) ? SUB : ADD + // OP: (stride > 0) ? SUB : ADD // XBIC: (stride * iv_scale > 0) ? 
-BIC : BIC // // which gives us the final pre-loop limit adjustment: @@ -2847,12 +2842,12 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // constrained_limit = MAX(old_limit - adjust_pre_iter, orig_limit) // = MAX(new_limit, orig_limit) (15a, stride < 0) // - // TODO rename scale -> iv_scale - const int stride = iv_stride(); - const int iv_scale = align_to_ref_p.scale_in_bytes(); - const int con = align_to_ref_p.offset_in_bytes(); - Node* base = align_to_ref_p.adr(); - Node* invar = align_to_ref_p.invar(); + const int stride = iv_stride(); + const int iv_scale = p.iv_scale(); + const int con = p.con(); + Node* base = p.decomposed_form().base().object_or_native(); + bool is_base_native = p.decomposed_form().base().is_native(); + Node* invar = nullptr; // TODO #ifdef ASSERT if (_trace._align_vector) { @@ -2934,7 +2929,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { } // 1.3: base (unless base is guaranteed aw aligned) - if (aw > ObjectAlignmentInBytes || align_to_ref_p.base()->is_top()) { + if (aw > ObjectAlignmentInBytes || is_base_native) { // The base is only aligned with ObjectAlignmentInBytes with arrays. // When the base() is top, we have no alignment guarantee at all. // Hence, we must now take the base into account for the calculation. 
From 960c1f574087675bd9642bc2e5b530f41628d50e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 15:55:20 +0100 Subject: [PATCH 031/130] add TODO --- src/hotspot/share/opto/superword.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index b46dd92ebf589..42f6157403d91 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2849,6 +2849,8 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { bool is_base_native = p.decomposed_form().base().is_native(); Node* invar = nullptr; // TODO + // TODO: maybe use NoOverflowInt here, and for solver? + #ifdef ASSERT if (_trace._align_vector) { tty->print_cr("\nVTransform::adjust_pre_loop_limit_to_align_main_loop_vectors:"); From 81bc5b4736b778578b5ae76dcfe150df70169ca3 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 16:07:59 +0100 Subject: [PATCH 032/130] fix invalid case for XPointer::never_overlaps_with --- src/hotspot/share/opto/vectorization.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index bc68d521e7ff6..266cc0a926d5e 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -504,6 +504,16 @@ bool XPointer::is_adjacent_to_and_before(const XPointer& other, const VLoop& vlo } bool XPointer::never_overlaps_with(const XPointer& other, const VLoop& vloop) const { + if (!is_valid() || !other.is_valid()) { +#ifndef PRODUCT + if (vloop.mptrace().is_trace_overlap()) { + tty->print_cr("Never Overlap: false, because of invalid XPointer."); + } +#endif + + return false; + } + const MemPointerDecomposedForm& s1 = decomposed_form(); const MemPointerDecomposedForm& s2 = other.decomposed_form(); const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA vloop.mptrace() )); From 
1e0cc13be7faeacf1d3137a3e80f410bd2591860 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 16:20:48 +0100 Subject: [PATCH 033/130] rm unnecessary assert --- src/hotspot/share/opto/vectorization.hpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index e3416ff8f414b..e07bd1ae70905 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -781,7 +781,8 @@ class XPointer : public ArenaObj { static bool init_is_valid(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { if (!decomposed_form.base().is_known()) { // XPointer needs to know if it is native (off-heap) or object (on-heap). - assert(false, "TODO find me!"); + // We may for example have failed to fully decompose the MemPointer, possibly + // because such a decomposition is not considered safe. return false; } From 8644ff7273240b51fbacfa52aef3a1b2eb49d88a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 16:30:34 +0100 Subject: [PATCH 034/130] rename stride -> iv_stride --- src/hotspot/share/opto/superword.cpp | 93 ++++++++++++++-------------- 1 file changed, 46 insertions(+), 47 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 42f6157403d91..0c69cf33505d5 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2714,7 +2714,6 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { const XPointer& p = xpointer(align_to_ref); assert(p.is_valid(), "sanity"); - // TODO rename stride -> iv_stride // For the main-loop, we want the address of align_to_ref to be memory aligned // with some alignment width (aw, a power of 2). When we enter the main-loop, // we know that iv is equal to the pre-loop limit. 
If we adjust the pre-loop @@ -2738,8 +2737,8 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // We want to find adjust_pre_iter, such that the address is aligned when entering // the main-loop: // - // iv = new_limit = old_limit + adjust_pre_iter (3a, stride > 0) - // iv = new_limit = old_limit - adjust_pre_iter (3b, stride < 0) + // iv = new_limit = old_limit + adjust_pre_iter (3a, iv_stride > 0) + // iv = new_limit = old_limit - adjust_pre_iter (3b, iv_stride < 0) // // We define bic as: // @@ -2748,19 +2747,19 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // And now we can simplify the address using (1), (3), and (4): // // adr = bic + iv_scale * new_limit - // adr = bic + iv_scale * (old_limit + adjust_pre_iter) (5a, stride > 0) - // adr = bic + iv_scale * (old_limit - adjust_pre_iter) (5b, stride < 0) + // adr = bic + iv_scale * (old_limit + adjust_pre_iter) (5a, iv_stride > 0) + // adr = bic + iv_scale * (old_limit - adjust_pre_iter) (5b, iv_stride < 0) // // And hence we can restate (2) with (5), and solve the equation for adjust_pre_iter: // - // (bic + iv_scale * (old_limit + adjust_pre_iter) % aw = 0 (6a, stride > 0) - // (bic + iv_scale * (old_limit - adjust_pre_iter) % aw = 0 (6b, stride < 0) + // (bic + iv_scale * (old_limit + adjust_pre_iter) % aw = 0 (6a, iv_stride > 0) + // (bic + iv_scale * (old_limit - adjust_pre_iter) % aw = 0 (6b, iv_stride < 0) // // In most cases, iv_scale is the element size, for example: // // for (i = 0; i < a.length; i++) { a[i] = ...; } // - // It is thus reasonable to assume that both abs(iv_scale) and abs(stride) are + // It is thus reasonable to assume that both abs(iv_scale) and abs(iv_stride) are // strictly positive powers of 2. Further, they can be assumed to be non-zero, // otherwise the address does not depend on iv, and the alignment cannot be // affected by adjusting the pre-loop limit. 
@@ -2783,8 +2782,8 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // // and restate (6), using (7) and (8), i.e. we divide (6) by abs(iv_scale): // - // (BIC + sign(iv_scale) * (old_limit + adjust_pre_iter) % AW = 0 (9a, stride > 0) - // (BIC + sign(iv_scale) * (old_limit - adjust_pre_iter) % AW = 0 (9b, stride < 0) + // (BIC + sign(iv_scale) * (old_limit + adjust_pre_iter) % AW = 0 (9a, iv_stride > 0) + // (BIC + sign(iv_scale) * (old_limit - adjust_pre_iter) % AW = 0 (9b, iv_stride < 0) // // where: sign(iv_scale) = iv_scale / abs(iv_scale) = (iv_scale > 0 ? 1 : -1) // @@ -2796,26 +2795,26 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // // We solve (9) for adjust_pre_iter, in the following 4 cases: // - // Case A: iv_scale > 0 && stride > 0 (i.e. sign(iv_scale) = 1) + // Case A: iv_scale > 0 && iv_stride > 0 (i.e. sign(iv_scale) = 1) // (BIC + old_limit + adjust_pre_iter) % AW = 0 // adjust_pre_iter = (-BIC - old_limit) % AW (11a) // - // Case B: iv_scale < 0 && stride > 0 (i.e. sign(iv_scale) = -1) + // Case B: iv_scale < 0 && iv_stride > 0 (i.e. sign(iv_scale) = -1) // (BIC - old_limit - adjust_pre_iter) % AW = 0 // adjust_pre_iter = (BIC - old_limit) % AW (11b) // - // Case C: iv_scale > 0 && stride < 0 (i.e. sign(iv_scale) = 1) + // Case C: iv_scale > 0 && iv_stride < 0 (i.e. sign(iv_scale) = 1) // (BIC + old_limit - adjust_pre_iter) % AW = 0 // adjust_pre_iter = (BIC + old_limit) % AW (11c) // - // Case D: iv_scale < 0 && stride < 0 (i.e. sign(iv_scale) = -1) + // Case D: iv_scale < 0 && iv_stride < 0 (i.e. sign(iv_scale) = -1) // (BIC - old_limit + adjust_pre_iter) % AW = 0 // adjust_pre_iter = (-BIC + old_limit) % AW (11d) // // We now generalize the equations (11*) by using: // - // OP: (stride > 0) ? SUB : ADD - // XBIC: (stride * iv_scale > 0) ? -BIC : BIC + // OP: (iv_stride > 0) ? SUB : ADD + // XBIC: (iv_stride * iv_scale > 0) ? 
-BIC : BIC // // which gives us the final pre-loop limit adjustment: // @@ -2823,12 +2822,12 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // // We can construct XBIC by additionally defining: // - // xbic = (stride * iv_scale > 0) ? -bic : bic (13) + // xbic = (iv_stride * iv_scale > 0) ? -bic : bic (13) // // which gives us: // - // XBIC = (stride * iv_scale > 0) ? -BIC : BIC - // = (stride * iv_scale > 0) ? -bic / abs(iv_scale) : bic / abs(iv_scale) + // XBIC = (iv_stride * iv_scale > 0) ? -BIC : BIC + // = (iv_stride * iv_scale > 0) ? -bic / abs(iv_scale) : bic / abs(iv_scale) // = xbic / abs(iv_scale) (14) // // When we have computed adjust_pre_iter, we update the pre-loop limit @@ -2838,16 +2837,16 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // the loop. Hence, we must constrain the updated limit as follows: // // constrained_limit = MIN(old_limit + adjust_pre_iter, orig_limit) - // = MIN(new_limit, orig_limit) (15a, stride > 0) + // = MIN(new_limit, orig_limit) (15a, iv_stride > 0) // constrained_limit = MAX(old_limit - adjust_pre_iter, orig_limit) - // = MAX(new_limit, orig_limit) (15a, stride < 0) + // = MAX(new_limit, orig_limit) (15a, iv_stride < 0) // - const int stride = iv_stride(); - const int iv_scale = p.iv_scale(); - const int con = p.con(); - Node* base = p.decomposed_form().base().object_or_native(); - bool is_base_native = p.decomposed_form().base().is_native(); - Node* invar = nullptr; // TODO + const int iv_stride = iv_stride(); + const int iv_scale = p.iv_scale(); + const int con = p.con(); + Node* base = p.decomposed_form().base().object_or_native(); + bool is_base_native = p.decomposed_form().base().is_native(); + Node* invar = nullptr; // TODO // TODO: maybe use NoOverflowInt here, and for solver? 
@@ -2858,10 +2857,10 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { align_to_ref->dump(); tty->print(" "); p.print_on(tty); - tty->print_cr(" aw: %d", aw); - tty->print_cr(" stride: %d", stride); - tty->print_cr(" iv_scale: %d", iv_scale); - tty->print_cr(" con: %d", con); + tty->print_cr(" aw: %d", aw); + tty->print_cr(" iv_stride: %d", iv_stride); + tty->print_cr(" iv_scale: %d", iv_scale); + tty->print_cr(" con: %d", con); tty->print(" base:"); base->dump(); if (invar == nullptr) { @@ -2877,21 +2876,21 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { } #endif - if (stride == 0 || !is_power_of_2(abs(stride)) || - iv_scale == 0 || !is_power_of_2(abs(iv_scale)) || // TODO abs ok? + if (iv_stride == 0 || !is_power_of_2(abs(iv_stride)) || + iv_scale == 0 || !is_power_of_2(abs(iv_scale)) || // TODO abs ok? abs(iv_scale) >= aw) { #ifdef ASSERT if (_trace._align_vector) { tty->print_cr(" Alignment cannot be affected by changing pre-loop limit because"); - tty->print_cr(" stride or iv_scale are not power of 2, or abs(iv_scale) >= aw."); + tty->print_cr(" iv_stride or iv_scale are not power of 2, or abs(iv_scale) >= aw."); } #endif // Cannot affect alignment, abort. 
return; } - assert(stride != 0 && is_power_of_2(abs(stride)) && - iv_scale != 0 && is_power_of_2(abs(iv_scale)) && + assert(iv_stride != 0 && is_power_of_2(abs(iv_stride)) && + iv_scale != 0 && is_power_of_2(abs(iv_scale)) && abs(iv_scale) < aw, "otherwise we cannot affect alignment with pre-loop"); const int AW = aw / abs(iv_scale); @@ -2903,9 +2902,9 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { #endif // 1: Compute (13a, b): - // xbic = -bic = (-base - invar - con) (stride * iv_scale > 0) - // xbic = +bic = (+base + invar + con) (stride * iv_scale < 0) - const bool is_sub = iv_scale * stride > 0; + // xbic = -bic = (-base - invar - con) (iv_stride * iv_scale > 0) + // xbic = +bic = (+base + invar + con) (iv_stride * iv_scale < 0) + const bool is_sub = iv_scale * iv_stride > 0; // 1.1: con Node* xbic = igvn().intcon(is_sub ? -con : con); @@ -2966,7 +2965,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // // 3.1: XBIC_OP_old_limit = XBIC OP old_limit Node* XBIC_OP_old_limit = nullptr; - if (stride > 0) { + if (iv_stride > 0) { XBIC_OP_old_limit = new SubINode(XBIC, old_limit); } else { XBIC_OP_old_limit = new AddINode(XBIC, old_limit); @@ -2994,8 +2993,8 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // range, and adjusts the main-loop limit so that we exit the main-loop // before we leave the "safe" range. After RCE, the range of the main-loop // can only be safely narrowed, and should never be widened. Hence, the - // pre-loop limit can only be increased (for stride > 0), but an add - // overflow might decrease it, or decreased (for stride < 0), but a sub + // pre-loop limit can only be increased (for iv_stride > 0), but an add + // overflow might decrease it, or decreased (for iv_stride < 0), but a sub // underflow might increase it. To prevent that, we perform the Sub / Add // and Max / Min with long operations. 
old_limit = new ConvI2LNode(old_limit); @@ -3009,11 +3008,11 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { TRACE_ALIGN_VECTOR_NODE(adjust_pre_iter); // 5: Compute (3a, b): - // new_limit = old_limit + adjust_pre_iter (stride > 0) - // new_limit = old_limit - adjust_pre_iter (stride < 0) + // new_limit = old_limit + adjust_pre_iter (iv_stride > 0) + // new_limit = old_limit - adjust_pre_iter (iv_stride < 0) // Node* new_limit = nullptr; - if (stride < 0) { + if (iv_stride < 0) { new_limit = new SubLNode(old_limit, adjust_pre_iter); } else { new_limit = new AddLNode(old_limit, adjust_pre_iter); @@ -3024,8 +3023,8 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // 6: Compute (15a, b): // Prevent pre-loop from going past the original limit of the loop. Node* constrained_limit = - (stride > 0) ? (Node*) new MinLNode(phase()->C, new_limit, orig_limit) - : (Node*) new MaxLNode(phase()->C, new_limit, orig_limit); + (iv_stride > 0) ? (Node*) new MinLNode(phase()->C, new_limit, orig_limit) + : (Node*) new MaxLNode(phase()->C, new_limit, orig_limit); phase()->register_new_node(constrained_limit, pre_ctrl); TRACE_ALIGN_VECTOR_NODE(constrained_limit); From 67744010d540e88e1436818b9352d1b347463b07 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 21 Nov 2024 16:32:12 +0100 Subject: [PATCH 035/130] fix build --- src/hotspot/share/opto/superword.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 0c69cf33505d5..d53848ff5640a 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2841,7 +2841,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // constrained_limit = MAX(old_limit - adjust_pre_iter, orig_limit) // = MAX(new_limit, orig_limit) (15a, iv_stride < 0) // - const int iv_stride = iv_stride(); + const int iv_stride = this->iv_stride(); const int iv_scale 
= p.iv_scale(); const int con = p.con(); Node* base = p.decomposed_form().base().object_or_native(); From ac115c5153a731f77ad08e88d0c8f5113cd6fb6f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 07:59:17 +0100 Subject: [PATCH 036/130] pipe in XPointer to AlignmentSolver --- src/hotspot/share/opto/superword.cpp | 10 +++------- src/hotspot/share/opto/vectorization.hpp | 24 ++++++++++++------------ 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index d53848ff5640a..d0a6407b0f190 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1479,17 +1479,13 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs"); const MemNode* mem_ref = pack->at(0)->as_Mem(); - const VPointer& mem_ref_p = vpointer(mem_ref); + const XPointer& mem_ref_p = xpointer(mem_ref); const CountedLoopEndNode* pre_end = _vloop.pre_loop_end(); assert(pre_end->stride_is_con(), "pre loop stride is constant"); - AlignmentSolver solver(pack->at(0)->as_Mem(), + AlignmentSolver solver(mem_ref_p, + pack->at(0)->as_Mem(), pack->size(), - mem_ref_p.base(), - mem_ref_p.offset_in_bytes(), - mem_ref_p.invar(), - mem_ref_p.invar_factor(), - mem_ref_p.scale_in_bytes(), pre_end->init_trip(), pre_end->stride_con(), iv_stride() diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index e07bd1ae70905..ebb2a724f8ff7 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -1270,6 +1270,9 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // a compatible solutions. class AlignmentSolver { private: + const XPointer& _xpointer; + + // TODO rm? 
const MemNode* _mem_ref; // first element const uint _vector_length; // number of elements in vector const int _element_size; @@ -1314,28 +1317,25 @@ class AlignmentSolver { } public: - AlignmentSolver(const MemNode* mem_ref, + AlignmentSolver(const XPointer& xpointer, + const MemNode* mem_ref, const uint vector_length, - const Node* base, - const int offset, - const Node* invar, - const int invar_factor, - const int scale, const Node* init_node, const int pre_stride, const int main_stride DEBUG_ONLY( COMMA const bool is_trace) ) : + _xpointer( xpointer), _mem_ref( mem_ref_not_null(mem_ref)), _vector_length( vector_length), - _element_size( _mem_ref->memory_size()), + _element_size( xpointer.size()), _vector_width( _vector_length * _element_size), _aw( MIN2(_vector_width, ObjectAlignmentInBytes)), - _base( base), - _offset( offset), - _invar( invar), - _invar_factor( invar_factor), - _scale( scale), + _base( xpointer.decomposed_form().base().object_or_native()), + _offset( xpointer.con()), + _invar( nullptr), // TODO + _invar_factor( 1), + _scale( xpointer.iv_scale()), _init_node( init_node), _pre_stride( pre_stride), _main_stride( main_stride) From 3f62dc1669f2469c66796089f416341ee611799b Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 09:18:28 +0100 Subject: [PATCH 037/130] use XPointer for VMemoryRegion --- src/hotspot/share/opto/mempointer.hpp | 5 ++-- src/hotspot/share/opto/superword.cpp | 12 ++++---- src/hotspot/share/opto/superword.hpp | 5 ---- src/hotspot/share/opto/vtransform.cpp | 42 +++++++++++++++------------ src/hotspot/share/opto/vtransform.hpp | 13 ++++++--- 5 files changed, 41 insertions(+), 36 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 8c7a362e3ac1e..76d9cba7fdbc8 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -666,12 +666,13 @@ class MemPointerDecomposedForm : public StackObj { } } - void print_on(outputStream* 
st) const { + void print_on(outputStream* st, bool end_with_cr = true) const { st->print("MemPointerDecomposedForm[base: "); _base.print_on(st); st->print(", form: "); print_form_on(st); - st->print_cr("]"); + st->print("]"); + if (end_with_cr) { st->cr(); } } #endif }; diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index d0a6407b0f190..af22d4c8cc80c 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -512,7 +512,7 @@ int SuperWord::MemOp::cmp_by_group(MemOp* a, MemOp* b) { // Opcode RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->Opcode(), b->mem()->Opcode()); - // VPointer summands + // XPointer summands return MemPointerDecomposedForm::cmp_summands(a->xpointer().decomposed_form(), b->xpointer().decomposed_form()); } @@ -522,7 +522,7 @@ int SuperWord::MemOp::cmp_by_group_and_con(MemOp* a, MemOp* b) { int cmp_group = cmp_by_group(a, b); if (cmp_group != 0) { return cmp_group; } - // VPointer con + // XPointer con jint a_con = a->xpointer().decomposed_form().con().value(); jint b_con = b->xpointer().decomposed_form().con().value(); RETURN_CMP_VALUE_IF_NOT_EQUAL(a_con, b_con); @@ -537,12 +537,12 @@ void SuperWord::create_adjacent_memop_pairs() { collect_valid_memops(memops); - // Sort the MemOps by group, and inside a group by VPointer con: - // - Group: all memops with the same opcode, and the same VPointer summands. Adjacent memops - // have the same opcode and the same VPointer summands, only the VPointer con is + // Sort the MemOps by group, and inside a group by XPointer con: + // - Group: all memops with the same opcode, and the same XPointer summands. Adjacent memops + // have the same opcode and the same XPointer summands, only the XPointer con is // different. Thus, two memops can only be adjacent if they are in the same group. // This decreases the work. 
- // - VPointer con: Sorting by VPointer con inside the group allows us to perform a sliding + // - XPointer con: Sorting by XPointer con inside the group allows us to perform a sliding // window algorithm, to determine adjacent memops efficiently. memops.sort(MemOp::cmp_by_group_and_con); diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 321f89ea5df70..0807028d6259b 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -56,8 +56,6 @@ // first statement is considered the left element, and the // second statement is considered the right element. -class VPointer; - // The PairSet is a set of pairs. These are later combined to packs, // and stored in the PackSet. class PairSet : public StackObj { @@ -500,9 +498,6 @@ class SuperWord : public ResourceObj { } // VLoopVPointer accessors - const VPointer& vpointer(const MemNode* mem) const { - return _vloop_analyzer.vpointers().vpointer(mem); - } const XPointer& xpointer(const MemNode* mem) const { return _vloop_analyzer.vpointers().xpointer(mem); } diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index d09a4c899f685..fd3d414e84284 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -150,10 +150,15 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const { if (a > b) { return 1; } // Helper-class for VTransformGraph::has_store_to_load_forwarding_failure. -// It represents a memory region: [ptr, ptr + memory_size) +// It represents a memory region: +// [adr, adr + memory_size) +// adr = base + invar + iv_scale * iv + con class VMemoryRegion : public StackObj { private: - Node* _base; // ptr = base + offset + invar + scale * iv + const XPointer* _xpointer; // reference not possible, need empty VMemoryRegion constructor for GrowableArray + + // TODO rm? - maybe also fix printing? 
+ Node* _base; int _scale; Node* _invar; int _offset; @@ -162,14 +167,15 @@ class VMemoryRegion : public StackObj { uint _schedule_order; public: - VMemoryRegion() {} // empty constructor for GrowableArray - VMemoryRegion(const VPointer& vpointer, int iv_offset, int vector_length, uint schedule_order) : - _base(vpointer.base()), - _scale(vpointer.scale_in_bytes()), - _invar(vpointer.invar()), - _offset(vpointer.offset_in_bytes() + _scale * iv_offset), - _memory_size(vpointer.memory_size() * vector_length), - _is_load(vpointer.mem()->is_Load()), + VMemoryRegion() : _xpointer(nullptr) {} // empty constructor for GrowableArray + VMemoryRegion(const XPointer& xpointer, int iv_offset, int vector_length, bool is_load, uint schedule_order) : + _xpointer(&xpointer), + _base( xpointer.decomposed_form().base().object_or_native()), + _scale( xpointer.iv_scale()), + _invar( nullptr), // TODO + _offset( xpointer.con() + _scale * iv_offset), + _memory_size(xpointer.size() * vector_length), + _is_load(is_load), _schedule_order(schedule_order) {} Node* base() const { return _base; } @@ -217,13 +223,10 @@ class VMemoryRegion : public StackObj { #ifndef PRODUCT void print() const { - tty->print("VMemoryRegion[%s %dbytes, schedule_order(%4d), base", + tty->print("VMemoryRegion[%s %dbytes, schedule_order(%4d), ", _is_load ? 
"load " : "store", _memory_size, _schedule_order); - VPointer::print_con_or_idx(_base); - tty->print(" + offset(%4d)", _offset); - tty->print(" + invar"); - VPointer::print_con_or_idx(_invar); - tty->print_cr(" + scale(%4d) * iv]", _scale); + _xpointer->decomposed_form().print_on(tty, false); + tty->print_cr("]"); } #endif }; @@ -350,11 +353,12 @@ bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& for (int i = 0; i < _schedule.length(); i++) { VTransformNode* vtn = _schedule.at(i); if (vtn->is_load_or_store_in_loop()) { - const VPointer& p = vtn->vpointer(vloop_analyzer); - if (p.valid()) { + const XPointer& p = vtn->xpointer(vloop_analyzer); + if (p.is_valid()) { VTransformVectorNode* vector = vtn->isa_Vector(); uint vector_length = vector != nullptr ? vector->nodes().length() : 1; - memory_regions.push(VMemoryRegion(p, iv_offset, vector_length, schedule_order++)); + bool is_load = vtn->is_load_in_loop(); + memory_regions.push(VMemoryRegion(p, iv_offset, vector_length, is_load, schedule_order++)); } } } diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 33329b050ce61..8b867e06d8aca 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -320,8 +320,9 @@ class VTransformNode : public ArenaObj { virtual VTransformLoadVectorNode* isa_LoadVector() { return nullptr; } virtual VTransformStoreVectorNode* isa_StoreVector() { return nullptr; } + virtual bool is_load_in_loop() const { return false; } virtual bool is_load_or_store_in_loop() const { return false; } - virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const { ShouldNotReachHere(); } + virtual const XPointer& xpointer(const VLoopAnalyzer& vloop_analyzer) const { ShouldNotReachHere(); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const = 0; @@ -345,8 +346,9 @@ class VTransformScalarNode : public 
VTransformNode { VTransformNode(vtransform, n->req()), _node(n) {} Node* node() const { return _node; } virtual VTransformScalarNode* isa_Scalar() override { return this; } + virtual bool is_load_in_loop() const override { return _node->is_Load(); } virtual bool is_load_or_store_in_loop() const override { return _node->is_Load() || _node->is_Store(); } - virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(node()->as_Mem()); } + virtual const XPointer& xpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().xpointer(node()->as_Mem()); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "Scalar"; };) @@ -361,6 +363,7 @@ class VTransformInputScalarNode : public VTransformScalarNode { VTransformInputScalarNode(VTransform& vtransform, Node* n) : VTransformScalarNode(vtransform, n) {} virtual VTransformInputScalarNode* isa_InputScalar() override { return this; } + virtual bool is_load_in_loop() const override { return false; } virtual bool is_load_or_store_in_loop() const override { return false; } NOT_PRODUCT(virtual const char* name() const override { return "InputScalar"; };) }; @@ -488,8 +491,9 @@ class VTransformLoadVectorNode : public VTransformVectorNode { VTransformVectorNode(vtransform, 3, number_of_nodes) {} LoadNode::ControlDependency control_dependency() const; virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; } + virtual bool is_load_in_loop() const override { return true; } virtual bool is_load_or_store_in_loop() const override { return true; } - virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); } + virtual const XPointer& xpointer(const VLoopAnalyzer& 
vloop_analyzer) const override { return vloop_analyzer.vpointers().xpointer(nodes().at(0)->as_Mem()); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };) @@ -501,8 +505,9 @@ class VTransformStoreVectorNode : public VTransformVectorNode { VTransformStoreVectorNode(VTransform& vtransform, uint number_of_nodes) : VTransformVectorNode(vtransform, 4, number_of_nodes) {} virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; } + virtual bool is_load_in_loop() const override { return false; } virtual bool is_load_or_store_in_loop() const override { return true; } - virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); } + virtual const XPointer& xpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().xpointer(nodes().at(0)->as_Mem()); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };) From 1c4496f1459521ec0a63a6b7b42e4e4de38d627e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 09:41:25 +0100 Subject: [PATCH 038/130] move overlap_possible_with_any_in --- src/hotspot/share/opto/vectorization.hpp | 26 +++++++++++------------- src/hotspot/share/opto/vtransform.cpp | 6 ++++-- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index ebb2a724f8ff7..b04c4812ffb00 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -754,6 +754,18 @@ class XPointer : public ArenaObj { bool is_adjacent_to_and_before(const XPointer& 
other, const VLoop& vloop) const; bool never_overlaps_with(const XPointer& other, const VLoop& vloop) const; + bool overlap_possible_with_any_in(const GrowableArray& nodes, const VLoop& vloop) const { + MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? + for (int i = 0; i < nodes.length(); i++) { + MemNode* mem = nodes.at(i)->as_Mem(); + XPointer mem_p(mem->as_Mem(), vloop, empty_callback); + if (!never_overlaps_with(mem_p, vloop)) { + return true; // possible overlap + } + } + return false; + } + NOT_PRODUCT( void print_on(outputStream* st) const; ) private: @@ -932,20 +944,6 @@ class VPointer : public ArenaObj { } } - bool overlap_possible_with_any_in(const GrowableArray& nodes) const { - for (int i = 0; i < nodes.length(); i++) { - MemNode* mem = nodes.at(i)->as_Mem(); - VPointer p_mem(mem, _vloop); - // Only if we know that we have Less or Greater can we - // be sure that there can never be an overlap between - // the two memory regions. - if (!not_equal(p_mem)) { - return true; - } - } - return false; - } - bool not_equal(const VPointer& q) const { return not_equal(cmp(q)); } bool equal(const VPointer& q) const { return equal(cmp(q)); } bool comparable(const VPointer& q) const { return comparable(cmp(q)); } diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index fd3d414e84284..4c8743c05fb49 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -580,8 +580,10 @@ VTransformApplyResult VTransformLoadVectorNode::apply(const VLoopAnalyzer& vloop // Walk up the memory chain, and ignore any StoreVector that provably // does not have any memory dependency. while (mem->is_StoreVector()) { - VPointer p_store(mem->as_Mem(), vloop_analyzer.vloop()); - if (p_store.overlap_possible_with_any_in(nodes())) { + // TODO refactor with XPointer for this vector load! + MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? 
+ XPointer store_p(mem->as_Mem(), vloop_analyzer.vloop(), empty_callback); + if (store_p.overlap_possible_with_any_in(nodes(), vloop_analyzer.vloop())) { break; } else { mem = mem->in(MemNode::Memory); From 0b5302c793a185fb3853a12d2f382e8fbf95945c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 09:47:34 +0100 Subject: [PATCH 039/130] rm VPointer --- src/hotspot/share/opto/vectorization.cpp | 122 +------------------- src/hotspot/share/opto/vectorization.hpp | 139 ----------------------- src/hotspot/share/opto/vtransform.hpp | 3 - 3 files changed, 4 insertions(+), 260 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 266cc0a926d5e..689bff71dbcca 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -30,19 +30,6 @@ #include "opto/rootnode.hpp" #include "opto/vectorization.hpp" -#ifndef PRODUCT -void VPointer::print_con_or_idx(const Node* n) { - if (n == nullptr) { - tty->print("( 0)"); - } else if (n->is_ConI()) { - jint val = n->as_ConI()->get_int(); - tty->print("(%4d)", val); - } else { - tty->print("[%4d]", n->_idx); - } -} -#endif - bool VLoop::check_preconditions() { #ifndef PRODUCT if (is_trace_preconditions()) { @@ -195,8 +182,6 @@ void VLoopVPointers::count_vpointers() { } void VLoopVPointers::allocate_vpointers_array() { - uint bytes = _vpointers_length * sizeof(VPointer); - _vpointers = (VPointer*)_arena->Amalloc(bytes); uint bytes2 = _vpointers_length * sizeof(XPointer); _xpointers = (XPointer*)_arena->Amalloc(bytes2); } @@ -205,8 +190,6 @@ void VLoopVPointers::compute_and_cache_vpointers() { int pointers_idx = 0; _body.for_each_mem([&] (MemNode* const mem, int bb_idx) { // Placement new: construct directly into the array. - ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop); - MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? 
::new (&_xpointers[pointers_idx]) XPointer(mem, _vloop, empty_callback); _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); @@ -214,14 +197,6 @@ void VLoopVPointers::compute_and_cache_vpointers() { }); } -const VPointer& VLoopVPointers::vpointer(const MemNode* mem) const { - assert(mem != nullptr && _vloop.in_bb(mem), "only mem in loop"); - int bb_idx = _body.bb_idx(mem); - int pointers_idx = _bb_idx_to_vpointer.at(bb_idx); - assert(0 <= pointers_idx && pointers_idx < _vpointers_length, "valid range"); - return _vpointers[pointers_idx]; -} - const XPointer& VLoopVPointers::xpointer(const MemNode* mem) const { assert(mem != nullptr && _vloop.in_bb(mem), "only mem in loop"); int bb_idx = _body.bb_idx(mem); @@ -235,10 +210,6 @@ void VLoopVPointers::print() const { tty->print_cr("\nVLoopVPointers::print:"); _body.for_each_mem([&] (const MemNode* mem, int bb_idx) { - const VPointer& p = vpointer(mem); - tty->print(" "); - p.print(); - const XPointer& xp = xpointer(mem); tty->print(" "); xp.print_on(tty); @@ -422,69 +393,6 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -VPointer::VPointer(MemNode* const mem, const VLoop& vloop, - Node_Stack* nstack, bool analyze_only) : - _mem(mem), _vloop(vloop), - _base(nullptr), _adr(nullptr), _scale(0), _offset(0), _invar(nullptr), -#ifdef ASSERT - _debug_invar(nullptr), _debug_negate_invar(false), _debug_invar_scale(nullptr), -#endif - _nstack(nstack), _analyze_only(analyze_only), _stack_idx(0) -{ - assert(!valid(), "all must be invalid!!!"); -} - -// Biggest detectable factor of the invariant. -int VPointer::invar_factor() const { - Node* n = invar(); - if (n == nullptr) { - return 0; - } - int opc = n->Opcode(); - if (opc == Op_LShiftI && n->in(2)->is_Con()) { - return 1 << n->in(2)->get_int(); - } else if (opc == Op_LShiftL && n->in(2)->is_Con()) { - return 1 << n->in(2)->get_int(); - } - // All our best-effort has failed. - return 1; -} - -// We use two comparisons, because a subtraction could underflow. 
-#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \ - if (a < b) { return -1; } \ - if (a > b) { return 1; } - -// To be in the same group, two VPointers must be the same, -// except for the offset. -int VPointer::cmp_for_sort_by_group(const VPointer** p1, const VPointer** p2) { - const VPointer* a = *p1; - const VPointer* b = *p2; - - RETURN_CMP_VALUE_IF_NOT_EQUAL(a->base()->_idx, b->base()->_idx); - RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->Opcode(), b->mem()->Opcode()); - RETURN_CMP_VALUE_IF_NOT_EQUAL(a->scale_in_bytes(), b->scale_in_bytes()); - - int a_inva_idx = a->invar() == nullptr ? 0 : a->invar()->_idx; - int b_inva_idx = b->invar() == nullptr ? 0 : b->invar()->_idx; - RETURN_CMP_VALUE_IF_NOT_EQUAL(a_inva_idx, b_inva_idx); - - return 0; // equal -} - -// We compare by group, then by offset, and finally by node idx. -int VPointer::cmp_for_sort(const VPointer** p1, const VPointer** p2) { - int cmp_group = cmp_for_sort_by_group(p1, p2); - if (cmp_group != 0) { return cmp_group; } - - const VPointer* a = *p1; - const VPointer* b = *p2; - - RETURN_CMP_VALUE_IF_NOT_EQUAL(a->offset_in_bytes(), b->offset_in_bytes()); - RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->_idx, b->mem()->_idx); - return 0; // equal -} - bool XPointer::is_adjacent_to_and_before(const XPointer& other, const VLoop& vloop) const { const MemPointerDecomposedForm& s1 = decomposed_form(); const MemPointerDecomposedForm& s2 = other.decomposed_form(); @@ -558,29 +466,6 @@ void XPointer::print_on(outputStream* st) const { _decomposed_form.print_form_on(st); st->print_cr("]"); } - -// Function for printing the fields of a VPointer -void VPointer::print() const { - tty->print("VPointer[mem: %4d %10s, ", _mem->_idx, _mem->Name()); - - if (!valid()) { - tty->print_cr("invalid]"); - return; - } - - tty->print("base: %4d, ", _base != nullptr ? _base->_idx : 0); - tty->print("adr: %4d, ", _adr != nullptr ? 
_adr->_idx : 0); - - tty->print(" base"); - VPointer::print_con_or_idx(_base); - - tty->print(" + offset(%4d)", _offset); - - tty->print(" + invar"); - VPointer::print_con_or_idx(_invar); - - tty->print_cr(" + scale(%4d) * iv]", _scale); -} #endif AlignmentSolution* AlignmentSolver::solve() const { @@ -1090,17 +975,18 @@ void AlignmentSolver::trace_start_solve() const { tty->print_cr(" invar_factor = %d", _invar_factor); + // TODO fix up printing // iv = init + pre_iter * pre_stride + main_iter * main_stride tty->print(" iv = init"); - VPointer::print_con_or_idx(_init_node); + //VPointer::print_con_or_idx(_init_node); tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)", _pre_stride, _main_stride); // adr = base + offset + invar + scale * iv tty->print(" adr = base"); - VPointer::print_con_or_idx(_base); + //VPointer::print_con_or_idx(_base); tty->print(" + offset(%d) + invar", _offset); - VPointer::print_con_or_idx(_invar); + //VPointer::print_con_or_idx(_invar); tty->print_cr(" + scale(%d) * iv", _scale); } } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index b04c4812ffb00..8056e412a003e 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -34,7 +34,6 @@ // Code in this file and the vectorization.cpp contains shared logics and // utilities for C2's loop auto-vectorization. 
-class VPointer; class XPointer; class VStatus : public StackObj { @@ -484,7 +483,6 @@ class VLoopVPointers : public StackObj { const VLoopBody& _body; // Array of cached pointers - VPointer* _vpointers; XPointer* _xpointers; int _vpointers_length; @@ -498,7 +496,6 @@ class VLoopVPointers : public StackObj { _arena(arena), _vloop(vloop), _body(body), - _vpointers(nullptr), _xpointers(nullptr), _bb_idx_to_vpointer(arena, vloop.estimated_body_length(), @@ -507,7 +504,6 @@ class VLoopVPointers : public StackObj { NONCOPYABLE(VLoopVPointers); void compute_vpointers(); - const VPointer& vpointer(const MemNode* mem) const; const XPointer& xpointer(const MemNode* mem) const; NOT_PRODUCT( void print() const; ) @@ -826,141 +822,6 @@ class XPointer : public ArenaObj { } }; -// TODO rm -// vpointer = base + con + invar + scale * iv -// -// -class VPointer : public ArenaObj { - protected: - MemNode* const _mem; // My memory reference node - const VLoop& _vloop; - - // Components of the simple form: - Node* _base; // Base address of an array OR null if some off-heap memory. - Node* _adr; // Same as _base if an array pointer OR some off-heap memory pointer. 
- int _scale; // multiplier for iv (in bytes), 0 if no loop iv - int _offset; // constant offset (in bytes) - - Node* _invar; // invariant offset (in bytes), null if none -#ifdef ASSERT - Node* _debug_invar; - bool _debug_negate_invar; // if true then use: (0 - _invar) - Node* _debug_invar_scale; // multiplier for invariant -#endif - - Node_Stack* _nstack; // stack used to record a vpointer trace of variants - bool _analyze_only; // Used in loop unrolling only for vpointer trace - uint _stack_idx; // Used in loop unrolling only for vpointer trace - - PhaseIdealLoop* phase() const { return _vloop.phase(); } - IdealLoopTree* lpt() const { return _vloop.lpt(); } - PhiNode* iv() const { return _vloop.iv(); } - - public: - enum CMP { - Less = 1, - Greater = 2, - Equal = 4, - NotEqual = (Less | Greater), - NotComparable = (Less | Greater | Equal) - }; - - VPointer(MemNode* const mem, const VLoop& vloop) : - VPointer(mem, vloop, nullptr, false) {} - VPointer(MemNode* const mem, const VLoop& vloop, Node_Stack* nstack) : - VPointer(mem, vloop, nstack, true) {} - private: - VPointer(MemNode* const mem, const VLoop& vloop, - Node_Stack* nstack, bool analyze_only); - NONCOPYABLE(VPointer); - - public: - bool valid() const { return _adr != nullptr; } - bool has_iv() const { return _scale != 0; } - - Node* base() const { return _base; } - Node* adr() const { return _adr; } - MemNode* mem() const { return _mem; } - int scale_in_bytes() const { return _scale; } - Node* invar() const { return _invar; } - int offset_in_bytes() const { return _offset; } - int memory_size() const { return _mem->memory_size(); } - Node_Stack* node_stack() const { return _nstack; } - - // Biggest detectable factor of the invariant. - int invar_factor() const; - - // Comparable? 
- bool invar_equals(const VPointer& q) const { - assert(_debug_invar == NodeSentinel || q._debug_invar == NodeSentinel || - (_invar == q._invar) == (_debug_invar == q._debug_invar && - _debug_invar_scale == q._debug_invar_scale && - _debug_negate_invar == q._debug_negate_invar), ""); - return _invar == q._invar; - } - - // We compute if and how two VPointers can alias at runtime, i.e. if the two addressed regions of memory can - // ever overlap. There are essentially 3 relevant return states: - // - NotComparable: Synonymous to "unknown aliasing". - // We have no information about how the two VPointers can alias. They could overlap, refer - // to another location in the same memory object, or point to a completely different object. - // -> Memory edge required. Aliasing unlikely but possible. - // - // - Less / Greater: Synonymous to "never aliasing". - // The two VPointers may point into the same memory object, but be non-aliasing (i.e. we - // know both address regions inside the same memory object, but these regions are non- - // overlapping), or the VPointers point to entirely different objects. - // -> No memory edge required. Aliasing impossible. - // - // - Equal: Synonymous to "overlap, or point to different memory objects". - // The two VPointers either overlap on the same memory object, or point to two different - // memory objects. - // -> Memory edge required. Aliasing likely. - // - // In a future refactoring, we can simplify to two states: - // - NeverAlias: instead of Less / Greater - // - MayAlias: instead of Equal / NotComparable - // - // Two VPointer are "comparable" (Less / Greater / Equal), iff all of these conditions apply: - // 1) Both are valid, i.e. expressible in the compound-long-int or simple form. - // 2) The adr are identical, or both are array bases of different arrays. - // 3) They have identical scale. - // 4) They have identical invar. - // 5) The difference in offsets is limited: abs(offset0 - offset1) < 2^31. 
- int cmp(const VPointer& q) const { - if (valid() && q.valid() && - (_adr == q._adr || (_base == _adr && q._base == q._adr)) && - _scale == q._scale && invar_equals(q)) { - jlong difference = abs(java_subtract((jlong)_offset, (jlong)q._offset)); - jlong max_diff = (jlong)1 << 31; - if (difference >= max_diff) { - return NotComparable; - } - bool overlap = q._offset < _offset + memory_size() && - _offset < q._offset + q.memory_size(); - return overlap ? Equal : (_offset < q._offset ? Less : Greater); - } else { - return NotComparable; - } - } - - bool not_equal(const VPointer& q) const { return not_equal(cmp(q)); } - bool equal(const VPointer& q) const { return equal(cmp(q)); } - bool comparable(const VPointer& q) const { return comparable(cmp(q)); } - static bool not_equal(int cmp) { return cmp <= NotEqual; } - static bool equal(int cmp) { return cmp == Equal; } - static bool comparable(int cmp) { return cmp < NotComparable; } - - // We need to be able to sort the VPointer to efficiently group the - // memops into groups, and to find adjacent memops. 
- static int cmp_for_sort_by_group(const VPointer** p1, const VPointer** p2); - static int cmp_for_sort(const VPointer** p1, const VPointer** p2); - - NOT_PRODUCT( void print() const; ) - NOT_PRODUCT( static void print_con_or_idx(const Node* n); ) -}; - - // Vector element size statistics for loop vectorization with vector masks class VectorElementSizeStats { private: diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 8b867e06d8aca..d5b071750b4c6 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -236,9 +236,6 @@ class VTransform : public StackObj { int iv_stride() const { return cl()->stride_con(); } // VLoopVPointers accessors - const VPointer& vpointer(const MemNode* mem) const { - return _vloop_analyzer.vpointers().vpointer(mem); - } const XPointer& xpointer(const MemNode* mem) const { return _vloop_analyzer.vpointers().xpointer(mem); } From af148eb54f33fb5c5ec9c91323fdf90ccd566d22 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 09:51:03 +0100 Subject: [PATCH 040/130] XPointer -> VPointer --- src/hotspot/share/opto/superword.cpp | 40 ++++++++++++------------ src/hotspot/share/opto/superword.hpp | 12 +++---- src/hotspot/share/opto/vectorization.cpp | 26 +++++++-------- src/hotspot/share/opto/vectorization.hpp | 40 ++++++++++++------------ src/hotspot/share/opto/vtransform.cpp | 24 +++++++------- src/hotspot/share/opto/vtransform.hpp | 12 +++---- 6 files changed, 77 insertions(+), 77 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index af22d4c8cc80c..10c972684c394 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -49,7 +49,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : { } -// Collect ignored loop nodes during XPointer parsing. +// Collect ignored loop nodes during VPointer parsing. 
class SuperWordUnrollingAnalysisIgnoredNodes : public MemPointerDecomposedFormParser::Callback { private: const VLoop& _vloop; @@ -180,7 +180,7 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa ignored_nodes.set_ignored(adr); } else { // Mark the internal nodes of the address expression in ignored_nodes. - XPointer xp(current, vloop, ignored_nodes); + VPointer xp(current, vloop, ignored_nodes); } } } @@ -512,9 +512,9 @@ int SuperWord::MemOp::cmp_by_group(MemOp* a, MemOp* b) { // Opcode RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->Opcode(), b->mem()->Opcode()); - // XPointer summands - return MemPointerDecomposedForm::cmp_summands(a->xpointer().decomposed_form(), - b->xpointer().decomposed_form()); + // VPointer summands + return MemPointerDecomposedForm::cmp_summands(a->vpointer().decomposed_form(), + b->vpointer().decomposed_form()); } int SuperWord::MemOp::cmp_by_group_and_con(MemOp* a, MemOp* b) { @@ -522,9 +522,9 @@ int SuperWord::MemOp::cmp_by_group_and_con(MemOp* a, MemOp* b) { int cmp_group = cmp_by_group(a, b); if (cmp_group != 0) { return cmp_group; } - // XPointer con - jint a_con = a->xpointer().decomposed_form().con().value(); - jint b_con = b->xpointer().decomposed_form().con().value(); + // VPointer con + jint a_con = a->vpointer().decomposed_form().con().value(); + jint b_con = b->vpointer().decomposed_form().con().value(); RETURN_CMP_VALUE_IF_NOT_EQUAL(a_con, b_con); return 0; @@ -537,12 +537,12 @@ void SuperWord::create_adjacent_memop_pairs() { collect_valid_memops(memops); - // Sort the MemOps by group, and inside a group by XPointer con: - // - Group: all memops with the same opcode, and the same XPointer summands. Adjacent memops - // have the same opcode and the same XPointer summands, only the XPointer con is + // Sort the MemOps by group, and inside a group by VPointer con: + // - Group: all memops with the same opcode, and the same VPointer summands. 
Adjacent memops + // have the same opcode and the same VPointer summands, only the VPointer con is // different. Thus, two memops can only be adjacent if they are in the same group. // This decreases the work. - // - XPointer con: Sorting by XPointer con inside the group allows us to perform a sliding + // - VPointer con: Sorting by VPointer con inside the group allows us to perform a sliding // window algorithm, to determine adjacent memops efficiently. memops.sort(MemOp::cmp_by_group_and_con); @@ -565,7 +565,7 @@ void SuperWord::create_adjacent_memop_pairs() { // Collect all memops that could potentially be vectorized. void SuperWord::collect_valid_memops(GrowableArray& memops) { for_each_mem([&] (MemNode* mem, int bb_idx) { - const XPointer& p = xpointer(mem); + const VPointer& p = vpointer(mem); if (p.is_valid() && !mem->is_LoadStore() && is_java_primitive(mem->memory_type())) { @@ -608,7 +608,7 @@ void SuperWord::create_adjacent_memop_pairs_in_one_group(const GrowableArrayprint(" "); memop.mem()->dump(); tty->print(" "); - memop.xpointer().print_on(tty); + memop.vpointer().print_on(tty); } } #endif @@ -618,13 +618,13 @@ void SuperWord::create_adjacent_memop_pairs_in_one_group(const GrowableArrayas_Mem()); - const XPointer& p2 = xpointer(s2->as_Mem()); + const VPointer& p1 = vpointer(s1->as_Mem()); + const VPointer& p2 = vpointer(s2->as_Mem()); return p1.is_adjacent_to_and_before(p2, _vloop); } @@ -1479,7 +1479,7 @@ const AlignmentSolution* SuperWord::pack_alignment_solution(const Node_List* pac assert(pack != nullptr && (pack->at(0)->is_Load() || pack->at(0)->is_Store()), "only load/store packs"); const MemNode* mem_ref = pack->at(0)->as_Mem(); - const XPointer& mem_ref_p = xpointer(mem_ref); + const VPointer& mem_ref_p = vpointer(mem_ref); const CountedLoopEndNode* pre_end = _vloop.pre_loop_end(); assert(pre_end->stride_is_con(), "pre loop stride is constant"); @@ -2707,7 +2707,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { Node* 
orig_limit = pre_opaq->original_loop_limit(); assert(orig_limit != nullptr && igvn().type(orig_limit) != Type::TOP, ""); - const XPointer& p = xpointer(align_to_ref); + const VPointer& p = vpointer(align_to_ref); assert(p.is_valid(), "sanity"); // For the main-loop, we want the address of align_to_ref to be memory aligned diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 0807028d6259b..c51e9cb418edb 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -498,8 +498,8 @@ class SuperWord : public ResourceObj { } // VLoopVPointer accessors - const XPointer& xpointer(const MemNode* mem) const { - return _vloop_analyzer.vpointers().xpointer(mem); + const VPointer& vpointer(const MemNode* mem) const { + return _vloop_analyzer.vpointers().vpointer(mem); } #ifndef PRODUCT @@ -564,15 +564,15 @@ class SuperWord : public ResourceObj { class MemOp : public StackObj { private: MemNode* _mem; - const XPointer* _xpointer; + const VPointer* _vpointer; public: // Empty, for GrowableArray - MemOp() : _mem(nullptr), _xpointer(nullptr) {} - MemOp(MemNode* mem, const XPointer* xpointer) : _mem(mem), _xpointer(xpointer) {} + MemOp() : _mem(nullptr), _vpointer(nullptr) {} + MemOp(MemNode* mem, const VPointer* vpointer) : _mem(mem), _vpointer(vpointer) {} MemNode* mem() const { return _mem; } - const XPointer& xpointer() const { return *_xpointer; } + const VPointer& vpointer() const { return *_vpointer; } static int cmp_by_group(MemOp* a, MemOp* b); static int cmp_by_group_and_con(MemOp* a, MemOp* b); diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 689bff71dbcca..a062e8b786248 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -182,8 +182,8 @@ void VLoopVPointers::count_vpointers() { } void VLoopVPointers::allocate_vpointers_array() { - uint bytes2 = _vpointers_length * sizeof(XPointer); - 
_xpointers = (XPointer*)_arena->Amalloc(bytes2); + uint bytes2 = _vpointers_length * sizeof(VPointer); + _vpointers = (VPointer*)_arena->Amalloc(bytes2); } void VLoopVPointers::compute_and_cache_vpointers() { @@ -191,18 +191,18 @@ void VLoopVPointers::compute_and_cache_vpointers() { _body.for_each_mem([&] (MemNode* const mem, int bb_idx) { // Placement new: construct directly into the array. MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? - ::new (&_xpointers[pointers_idx]) XPointer(mem, _vloop, empty_callback); + ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, empty_callback); _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); pointers_idx++; }); } -const XPointer& VLoopVPointers::xpointer(const MemNode* mem) const { +const VPointer& VLoopVPointers::vpointer(const MemNode* mem) const { assert(mem != nullptr && _vloop.in_bb(mem), "only mem in loop"); int bb_idx = _body.bb_idx(mem); int pointers_idx = _bb_idx_to_vpointer.at(bb_idx); assert(0 <= pointers_idx && pointers_idx < _vpointers_length, "valid range"); - return _xpointers[pointers_idx]; + return _vpointers[pointers_idx]; } #ifndef PRODUCT @@ -210,7 +210,7 @@ void VLoopVPointers::print() const { tty->print_cr("\nVLoopVPointers::print:"); _body.for_each_mem([&] (const MemNode* mem, int bb_idx) { - const XPointer& xp = xpointer(mem); + const VPointer& xp = vpointer(mem); tty->print(" "); xp.print_on(tty); }); @@ -244,7 +244,7 @@ void VLoopDependencyGraph::construct() { MemNode* n1 = slice_nodes.at(j); memory_pred_edges.clear(); - const XPointer& p1 = _vpointers.xpointer(n1); + const VPointer& p1 = _vpointers.vpointer(n1); // For all memory nodes before it, check if we need to add a memory edge. 
for (int k = slice_nodes.length() - 1; k > j; k--) { MemNode* n2 = slice_nodes.at(k); @@ -252,7 +252,7 @@ void VLoopDependencyGraph::construct() { // Ignore Load-Load dependencies: if (n1->is_Load() && n2->is_Load()) { continue; } - const XPointer& p2 = _vpointers.xpointer(n2); + const VPointer& p2 = _vpointers.vpointer(n2); if (!p1.never_overlaps_with(p2, _vloop)) { // Possibly overlapping memory memory_pred_edges.append(_body.bb_idx(n2)); @@ -393,7 +393,7 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -bool XPointer::is_adjacent_to_and_before(const XPointer& other, const VLoop& vloop) const { +bool VPointer::is_adjacent_to_and_before(const VPointer& other, const VLoop& vloop) const { const MemPointerDecomposedForm& s1 = decomposed_form(); const MemPointerDecomposedForm& s2 = other.decomposed_form(); const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA vloop.mptrace() )); @@ -411,11 +411,11 @@ bool XPointer::is_adjacent_to_and_before(const XPointer& other, const VLoop& vlo return is_adjacent; } -bool XPointer::never_overlaps_with(const XPointer& other, const VLoop& vloop) const { +bool VPointer::never_overlaps_with(const VPointer& other, const VLoop& vloop) const { if (!is_valid() || !other.is_valid()) { #ifndef PRODUCT if (vloop.mptrace().is_trace_overlap()) { - tty->print_cr("Never Overlap: false, because of invalid XPointer."); + tty->print_cr("Never Overlap: false, because of invalid VPointer."); } #endif @@ -452,8 +452,8 @@ bool XPointer::never_overlaps_with(const XPointer& other, const VLoop& vloop) co } #ifndef PRODUCT -void XPointer::print_on(outputStream* st) const { - st->print("XPointer["); +void VPointer::print_on(outputStream* st) const { + st->print("VPointer["); if (!is_valid()) { st->print_cr("invalid]"); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 8056e412a003e..d612804cfd488 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ 
b/src/hotspot/share/opto/vectorization.hpp @@ -34,7 +34,7 @@ // Code in this file and the vectorization.cpp contains shared logics and // utilities for C2's loop auto-vectorization. -class XPointer; +class VPointer; class VStatus : public StackObj { private: @@ -483,7 +483,7 @@ class VLoopVPointers : public StackObj { const VLoopBody& _body; // Array of cached pointers - XPointer* _xpointers; + VPointer* _vpointers; int _vpointers_length; // Map bb_idx -> index in _vpointers. -1 if not mapped. @@ -496,7 +496,7 @@ class VLoopVPointers : public StackObj { _arena(arena), _vloop(vloop), _body(body), - _xpointers(nullptr), + _vpointers(nullptr), _bb_idx_to_vpointer(arena, vloop.estimated_body_length(), vloop.estimated_body_length(), @@ -504,7 +504,7 @@ class VLoopVPointers : public StackObj { NONCOPYABLE(VLoopVPointers); void compute_vpointers(); - const XPointer& xpointer(const MemNode* mem) const; + const VPointer& vpointer(const MemNode* mem) const; NOT_PRODUCT( void print() const; ) private: @@ -677,7 +677,7 @@ class VLoopAnalyzer : StackObj { VStatus setup_submodules_helper(); }; -// XPointer adapts the MemPointerDecomposedForm to the use in a loop: +// VPointer adapts the MemPointerDecomposedForm to the use in a loop: // // pointer = SUM(summands) + con // @@ -700,7 +700,7 @@ class VLoopAnalyzer : StackObj { // pointer does not depend on the iv, since otherwise there would have to be // a summand where its variable it main-loop variant. // -class XPointer : public ArenaObj { +class VPointer : public ArenaObj { private: typedef MemPointerDecomposedFormParser::Callback Callback; @@ -714,14 +714,14 @@ class XPointer : public ArenaObj { public: // Default constructor, e.g. for GrowableArray. 
- XPointer() : + VPointer() : _decomposed_form(), _size(0), _iv_scale(0), _is_valid(false) {} template - XPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : + VPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : _decomposed_form(init_decomposed_form(mem, adr_node_callback)), _size(mem->memory_size()), _iv_scale(init_iv_scale(_decomposed_form, vloop)), @@ -729,7 +729,7 @@ class XPointer : public ArenaObj { { #ifndef PRODUCT if (vloop.mptrace().is_trace_pointer()) { - tty->print_cr("XPointer::XPointer:"); + tty->print_cr("VPointer::VPointer:"); tty->print("mem: "); mem->dump(); print_on(tty); mem->in(MemNode::Address)->dump_bfs(7, 0, "d"); @@ -747,14 +747,14 @@ class XPointer : public ArenaObj { // Aliasing // TODO refactor together with MemPointer - should be shared code. Maybe the _size needs to be in ...Form? - bool is_adjacent_to_and_before(const XPointer& other, const VLoop& vloop) const; - bool never_overlaps_with(const XPointer& other, const VLoop& vloop) const; + bool is_adjacent_to_and_before(const VPointer& other, const VLoop& vloop) const; + bool never_overlaps_with(const VPointer& other, const VLoop& vloop) const; bool overlap_possible_with_any_in(const GrowableArray& nodes, const VLoop& vloop) const { MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? for (int i = 0; i < nodes.length(); i++) { MemNode* mem = nodes.at(i)->as_Mem(); - XPointer mem_p(mem->as_Mem(), vloop, empty_callback); + VPointer mem_p(mem->as_Mem(), vloop, empty_callback); if (!never_overlaps_with(mem_p, vloop)) { return true; // possible overlap } @@ -788,7 +788,7 @@ class XPointer : public ArenaObj { // TODO why pre-loop static bool init_is_valid(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { if (!decomposed_form.base().is_known()) { - // XPointer needs to know if it is native (off-heap) or object (on-heap). + // VPointer needs to know if it is native (off-heap) or object (on-heap). 
// We may for example have failed to fully decompose the MemPointer, possibly // because such a decomposition is not considered safe. return false; @@ -1129,7 +1129,7 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // a compatible solutions. class AlignmentSolver { private: - const XPointer& _xpointer; + const VPointer& _vpointer; // TODO rm? const MemNode* _mem_ref; // first element @@ -1176,7 +1176,7 @@ class AlignmentSolver { } public: - AlignmentSolver(const XPointer& xpointer, + AlignmentSolver(const VPointer& vpointer, const MemNode* mem_ref, const uint vector_length, const Node* init_node, @@ -1184,17 +1184,17 @@ class AlignmentSolver { const int main_stride DEBUG_ONLY( COMMA const bool is_trace) ) : - _xpointer( xpointer), + _vpointer( vpointer), _mem_ref( mem_ref_not_null(mem_ref)), _vector_length( vector_length), - _element_size( xpointer.size()), + _element_size( vpointer.size()), _vector_width( _vector_length * _element_size), _aw( MIN2(_vector_width, ObjectAlignmentInBytes)), - _base( xpointer.decomposed_form().base().object_or_native()), - _offset( xpointer.con()), + _base( vpointer.decomposed_form().base().object_or_native()), + _offset( vpointer.con()), _invar( nullptr), // TODO _invar_factor( 1), - _scale( xpointer.iv_scale()), + _scale( vpointer.iv_scale()), _init_node( init_node), _pre_stride( pre_stride), _main_stride( main_stride) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 4c8743c05fb49..e23cd17ea6d4a 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -155,7 +155,7 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const { // adr = base + invar + iv_scale * iv + con class VMemoryRegion : public StackObj { private: - const XPointer* _xpointer; // reference not possible, need empty VMemoryRegion constructor for GrowableArray + const VPointer* _vpointer; // reference not possible, need empty VMemoryRegion constructor 
for GrowableArray // TODO rm? - maybe also fix printing? Node* _base; @@ -167,14 +167,14 @@ class VMemoryRegion : public StackObj { uint _schedule_order; public: - VMemoryRegion() : _xpointer(nullptr) {} // empty constructor for GrowableArray - VMemoryRegion(const XPointer& xpointer, int iv_offset, int vector_length, bool is_load, uint schedule_order) : - _xpointer(&xpointer), - _base( xpointer.decomposed_form().base().object_or_native()), - _scale( xpointer.iv_scale()), + VMemoryRegion() : _vpointer(nullptr) {} // empty constructor for GrowableArray + VMemoryRegion(const VPointer& vpointer, int iv_offset, int vector_length, bool is_load, uint schedule_order) : + _vpointer(&vpointer), + _base( vpointer.decomposed_form().base().object_or_native()), + _scale( vpointer.iv_scale()), _invar( nullptr), // TODO - _offset( xpointer.con() + _scale * iv_offset), - _memory_size(xpointer.size() * vector_length), + _offset( vpointer.con() + _scale * iv_offset), + _memory_size(vpointer.size() * vector_length), _is_load(is_load), _schedule_order(schedule_order) {} @@ -225,7 +225,7 @@ class VMemoryRegion : public StackObj { void print() const { tty->print("VMemoryRegion[%s %dbytes, schedule_order(%4d), ", _is_load ? "load " : "store", _memory_size, _schedule_order); - _xpointer->decomposed_form().print_on(tty, false); + _vpointer->decomposed_form().print_on(tty, false); tty->print_cr("]"); } #endif @@ -353,7 +353,7 @@ bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& for (int i = 0; i < _schedule.length(); i++) { VTransformNode* vtn = _schedule.at(i); if (vtn->is_load_or_store_in_loop()) { - const XPointer& p = vtn->xpointer(vloop_analyzer); + const VPointer& p = vtn->vpointer(vloop_analyzer); if (p.is_valid()) { VTransformVectorNode* vector = vtn->isa_Vector(); uint vector_length = vector != nullptr ? 
vector->nodes().length() : 1; @@ -580,9 +580,9 @@ VTransformApplyResult VTransformLoadVectorNode::apply(const VLoopAnalyzer& vloop // Walk up the memory chain, and ignore any StoreVector that provably // does not have any memory dependency. while (mem->is_StoreVector()) { - // TODO refactor with XPointer for this vector load! + // TODO refactor with VPointer for this vector load! MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? - XPointer store_p(mem->as_Mem(), vloop_analyzer.vloop(), empty_callback); + VPointer store_p(mem->as_Mem(), vloop_analyzer.vloop(), empty_callback); if (store_p.overlap_possible_with_any_in(nodes(), vloop_analyzer.vloop())) { break; } else { diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index d5b071750b4c6..9090f68483394 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -236,8 +236,8 @@ class VTransform : public StackObj { int iv_stride() const { return cl()->stride_con(); } // VLoopVPointers accessors - const XPointer& xpointer(const MemNode* mem) const { - return _vloop_analyzer.vpointers().xpointer(mem); + const VPointer& vpointer(const MemNode* mem) const { + return _vloop_analyzer.vpointers().vpointer(mem); } // Ensure that the main loop vectors are aligned by adjusting the pre loop limit. 
@@ -319,7 +319,7 @@ class VTransformNode : public ArenaObj { virtual bool is_load_in_loop() const { return false; } virtual bool is_load_or_store_in_loop() const { return false; } - virtual const XPointer& xpointer(const VLoopAnalyzer& vloop_analyzer) const { ShouldNotReachHere(); } + virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const { ShouldNotReachHere(); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const = 0; @@ -345,7 +345,7 @@ class VTransformScalarNode : public VTransformNode { virtual VTransformScalarNode* isa_Scalar() override { return this; } virtual bool is_load_in_loop() const override { return _node->is_Load(); } virtual bool is_load_or_store_in_loop() const override { return _node->is_Load() || _node->is_Store(); } - virtual const XPointer& xpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().xpointer(node()->as_Mem()); } + virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(node()->as_Mem()); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "Scalar"; };) @@ -490,7 +490,7 @@ class VTransformLoadVectorNode : public VTransformVectorNode { virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; } virtual bool is_load_in_loop() const override { return true; } virtual bool is_load_or_store_in_loop() const override { return true; } - virtual const XPointer& xpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().xpointer(nodes().at(0)->as_Mem()); } + virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); } virtual 
VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };) @@ -504,7 +504,7 @@ class VTransformStoreVectorNode : public VTransformVectorNode { virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; } virtual bool is_load_in_loop() const override { return false; } virtual bool is_load_or_store_in_loop() const override { return true; } - virtual const XPointer& xpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().xpointer(nodes().at(0)->as_Mem()); } + virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };) From a834c07f7655f3da5ee175db795cbe511425be1d Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 10:11:30 +0100 Subject: [PATCH 041/130] rename offset -> con --- src/hotspot/share/opto/vectorization.cpp | 58 ++++++++++++------------ src/hotspot/share/opto/vectorization.hpp | 20 ++++---- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index a062e8b786248..95a8ca9ef8341 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -489,7 +489,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // // The Simple form of the address is disassembled by VPointer into: // - // adr = base + offset + invar + scale * iv + // adr = base + invar + scale * iv + con // // Where the iv can be written as: // @@ -508,11 +508,11 @@ AlignmentSolution* AlignmentSolver::solve() const { // 
Simple form Expansion of iv variable Reshaped with constants Comments for terms // ----------- ------------------------ ----------------------- ------------------ // adr = base = base = base (base % aw = 0) - // + offset + offset + C_const (sum of constant terms) // + invar + invar_factor * var_invar + C_invar * var_invar (term for invariant) // / + scale * init + C_init * var_init (term for variable init) // + scale * iv -> | + scale * pre_stride * pre_iter + C_pre * pre_iter (adjustable pre-loop term) // \ + scale * main_stride * main_iter + C_main * main_iter (main-loop term) + // + con + con + C_const (sum of constant terms) // // We describe the 6 terms: // 1) The "base" of the address is the address of a Java object (e.g. array), @@ -522,7 +522,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // // base % ObjectAlignmentInBytes = 0 ==> base % aw = 0 // - // 2) The "C_const" term is the sum of all constant terms. This is "offset", + // 2) The "C_const" term is the sum of all constant terms. This is "con", // plus "scale * init" if it is constant. // 3) The "C_invar * var_invar" is the factorization of "invar" into a constant // and variable term. If there is no invariant, then "C_invar" is zero. @@ -547,7 +547,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // Attribute init (i.e. _init_node) either to C_const or to C_init term. const int C_const_init = _init_node->is_ConI() ? _init_node->as_ConI()->get_int() : 0; - const int C_const = _offset + C_const_init * _scale; + const int C_const = _con + C_const_init * _scale; // Set C_invar depending on if invar is present const int C_invar = (_invar == nullptr) ? 
0 : abs(_invar_factor); @@ -906,36 +906,36 @@ AlignmentSolution* AlignmentSolver::solve() const { // adr % aw = // // -> Simple form - // (base + offset + invar + scale * iv) % aw = + // (base + invar + scale * iv + con) % aw = // // -> Expand iv - // (base + offset + invar + scale * (init + pre_stride * pre_iter + main_stride * main_iter)) % aw = + // (base + con + invar + scale * (init + pre_stride * pre_iter + main_stride * main_iter)) % aw = // // -> Reshape - // (base + offset + invar + // (base + con + invar // + scale * init // + scale * pre_stride * pre_iter // + scale * main_stride * main_iter)) % aw = // // -> base aligned: base % aw = 0 // -> main-loop iterations aligned (2): C_main % aw = (scale * main_stride) % aw = 0 - // (offset + invar + scale * init + scale * pre_stride * pre_iter) % aw = + // (con + invar + scale * init + scale * pre_stride * pre_iter) % aw = // // -> apply (12) - // (offset + invar + scale * init - // + scale * pre_stride * (m * q - C_const / (scale * pre_stride) - // [- invar / (scale * pre_stride) ] - // [- init / pre_stride ] - // ) + // (con + invar + scale * init + // + scale * pre_stride * (m * q - C_const / (scale * pre_stride) + // [- invar / (scale * pre_stride) ] + // [- init / pre_stride ] + // ) // ) % aw = // - // -> expand C_const = offset [+ init * scale] (if init const) - // (offset + invar + scale * init - // + scale * pre_stride * (m * q - offset / (scale * pre_stride) - // [- init / pre_stride ] (if init constant) - // [- invar / (scale * pre_stride) ] (if invar present) - // [- init / pre_stride ] (if init variable) - // ) + // -> expand C_const = con [+ init * scale] (if init const) + // (con + invar + scale * init + // + scale * pre_stride * (m * q - con / (scale * pre_stride) + // [- init / pre_stride ] (if init constant) + // [- invar / (scale * pre_stride) ] (if invar present) + // [- init / pre_stride ] (if init variable) + // ) // ) % aw = // // -> assuming invar = 0 if it is not present @@ -943,11 
+943,11 @@ AlignmentSolution* AlignmentSolver::solve() const { // -> apply (8): q = aw / (abs(C_pre)) = aw / abs(scale * pre_stride) // -> and hence: (scale * pre_stride * q) % aw = 0 // -> all terms are canceled out - // (offset + invar + scale * init - // + scale * pre_stride * m * q -> aw aligned - // - scale * pre_stride * offset / (scale * pre_stride) -> = offset - // - scale * pre_stride * init / pre_stride -> = scale * init - // - scale * pre_stride * invar / (scale * pre_stride) -> = invar + // (con + invar + scale * init + // + scale * pre_stride * m * q -> aw aligned + // - scale * pre_stride * con / (scale * pre_stride) -> = con + // - scale * pre_stride * init / pre_stride -> = scale * init + // - scale * pre_stride * invar / (scale * pre_stride) -> = invar // ) % aw = 0 // // The solution given by (12) does indeed guarantee alignment. @@ -982,10 +982,10 @@ void AlignmentSolver::trace_start_solve() const { tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)", _pre_stride, _main_stride); - // adr = base + offset + invar + scale * iv + // adr = base + con + invar + scale * iv tty->print(" adr = base"); //VPointer::print_con_or_idx(_base); - tty->print(" + offset(%d) + invar", _offset); + tty->print(" + con(%d) + invar", _con); //VPointer::print_con_or_idx(_invar); tty->print_cr(" + scale(%d) * iv", _scale); } @@ -1018,8 +1018,8 @@ void AlignmentSolver::trace_reshaped_form(const int C_const, tty->print_cr(" no invariant:"); tty->print_cr(" C_invar = %d", C_invar); } - tty->print_cr(" C_const = offset(%d) + scale(%d) * C_const_init(%d) = %d", - _offset, _scale, C_const_init, C_const); + tty->print_cr(" C_const = con(%d) + scale(%d) * C_const_init(%d) = %d", + _con, _scale, C_const_init, C_const); tty->print_cr(" C_pre = scale(%d) * pre_stride(%d) = %d", _scale, _pre_stride, C_pre); tty->print_cr(" C_main = scale(%d) * main_stride(%d) = %d", diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp 
index d612804cfd488..2eda953b56273 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -870,9 +870,9 @@ class VectorElementSizeStats { // When alignment is required, we must adjust the pre-loop iteration count pre_iter, // such that the address is aligned for any main_iter >= 0: // -// adr = base + offset + invar + scale * init -// + scale * pre_stride * pre_iter -// + scale * main_stride * main_iter +// adr = base + invar + scale * init + con +// + scale * pre_stride * pre_iter +// + scale * main_stride * main_iter // // The AlignmentSolver generates solutions of the following forms: // 1. Empty: No pre_iter guarantees alignment. @@ -1102,8 +1102,8 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // // pre-loop: // iv = init + i * pre_stride -// adr = base + offset + invar + scale * iv -// adr = base + offset + invar + scale * (init + i * pre_stride) +// adr = base + invar + scale * iv + con +// adr = base + invar + scale * (init + i * pre_stride) + con // iv += pre_stride // i++ // @@ -1117,7 +1117,7 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // i = pre_iter + main_iter * unroll_factor // iv = init + i * pre_stride = init + pre_iter * pre_stride + main_iter * unroll_factor * pre_stride // = init + pre_iter * pre_stride + main_iter * main_stride -// adr = base + offset + invar + scale * iv // must be aligned +// adr = base + invar + scale * iv + con // must be aligned // iv += main_stride // i += unroll_factor // main_iter++ @@ -1140,7 +1140,7 @@ class AlignmentSolver { // All vector loads and stores need to be memory aligned. The alignment width (aw) in // principle is the vector_width. But when vector_width > ObjectAlignmentInBytes this is // too strict, since any memory object is only guaranteed to be ObjectAlignmentInBytes - // aligned. For example, the relative offset between two arrays is only guaranteed to + // aligned. 
For example, the relative distance between two arrays is only guaranteed to // be divisible by ObjectAlignmentInBytes. const int _aw; @@ -1150,7 +1150,7 @@ class AlignmentSolver { // // The Simple form of the address is disassembled by VPointer into: // - // adr = base + offset + invar + scale * iv + // adr = base + invar + scale * iv + con // // Where the iv can be written as: // @@ -1160,7 +1160,7 @@ class AlignmentSolver { // main_iter: number of main-loop iterations (main_iter >= 0) // const Node* _base; // base of address (e.g. Java array object, aw-aligned) - const int _offset; + const int _con; const Node* _invar; const int _invar_factor; // known constant factor of invar const int _scale; @@ -1191,7 +1191,7 @@ class AlignmentSolver { _vector_width( _vector_length * _element_size), _aw( MIN2(_vector_width, ObjectAlignmentInBytes)), _base( vpointer.decomposed_form().base().object_or_native()), - _offset( vpointer.con()), + _con( vpointer.con()), _invar( nullptr), // TODO _invar_factor( 1), _scale( vpointer.iv_scale()), From 0e59a74a85be34a5719e586d3582d31b86bf0074 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 10:17:13 +0100 Subject: [PATCH 042/130] rm con field --- src/hotspot/share/opto/vectorization.cpp | 6 +++--- src/hotspot/share/opto/vectorization.hpp | 2 -- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 95a8ca9ef8341..e31607059aa5e 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -547,7 +547,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // Attribute init (i.e. _init_node) either to C_const or to C_init term. const int C_const_init = _init_node->is_ConI() ? 
_init_node->as_ConI()->get_int() : 0; - const int C_const = _con + C_const_init * _scale; + const int C_const = _vpointer.con() + C_const_init * _scale; // Set C_invar depending on if invar is present const int C_invar = (_invar == nullptr) ? 0 : abs(_invar_factor); @@ -985,7 +985,7 @@ void AlignmentSolver::trace_start_solve() const { // adr = base + con + invar + scale * iv tty->print(" adr = base"); //VPointer::print_con_or_idx(_base); - tty->print(" + con(%d) + invar", _con); + tty->print(" + con(%d) + invar", _vpointer.con()); //VPointer::print_con_or_idx(_invar); tty->print_cr(" + scale(%d) * iv", _scale); } @@ -1019,7 +1019,7 @@ void AlignmentSolver::trace_reshaped_form(const int C_const, tty->print_cr(" C_invar = %d", C_invar); } tty->print_cr(" C_const = con(%d) + scale(%d) * C_const_init(%d) = %d", - _con, _scale, C_const_init, C_const); + _vpointer.con(), _scale, C_const_init, C_const); tty->print_cr(" C_pre = scale(%d) * pre_stride(%d) = %d", _scale, _pre_stride, C_pre); tty->print_cr(" C_main = scale(%d) * main_stride(%d) = %d", diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 2eda953b56273..cc9471b5b3b1d 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -1160,7 +1160,6 @@ class AlignmentSolver { // main_iter: number of main-loop iterations (main_iter >= 0) // const Node* _base; // base of address (e.g. 
Java array object, aw-aligned) - const int _con; const Node* _invar; const int _invar_factor; // known constant factor of invar const int _scale; @@ -1191,7 +1190,6 @@ class AlignmentSolver { _vector_width( _vector_length * _element_size), _aw( MIN2(_vector_width, ObjectAlignmentInBytes)), _base( vpointer.decomposed_form().base().object_or_native()), - _con( vpointer.con()), _invar( nullptr), // TODO _invar_factor( 1), _scale( vpointer.iv_scale()), From 179bcb429e87ed45d337c712433763c7bcabc555 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 10:28:42 +0100 Subject: [PATCH 043/130] rename scale -> iv_scale --- src/hotspot/share/opto/vectorization.cpp | 158 +++++++++++------------ src/hotspot/share/opto/vectorization.hpp | 46 +++---- 2 files changed, 102 insertions(+), 102 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index e31607059aa5e..8cc2be4c5b99e 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -478,9 +478,9 @@ AlignmentSolution* AlignmentSolver::solve() const { assert(is_power_of_2(abs(_main_stride)), "main_stride is power of 2"); assert(_aw > 0 && is_power_of_2(_aw), "aw must be power of 2"); - // Out of simplicity: non power-of-2 scale not supported. - if (abs(_scale) == 0 || !is_power_of_2(abs(_scale))) { - return new EmptyAlignmentSolution("non power-of-2 scale not supported"); + // Out of simplicity: non power-of-2 iv_scale not supported. + if (abs(_iv_scale) == 0 || !is_power_of_2(abs(_iv_scale))) { + return new EmptyAlignmentSolution("non power-of-2 iv_scale not supported"); } // We analyze the address of mem_ref. 
The idea is to disassemble it into a linear @@ -489,7 +489,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // // The Simple form of the address is disassembled by VPointer into: // - // adr = base + invar + scale * iv + con + // adr = base + invar + iv_scale * iv + con // // Where the iv can be written as: // @@ -505,14 +505,14 @@ AlignmentSolution* AlignmentSolver::solve() const { // expanding the iv variable. In a second step, we reshape the expression again, and // state it as a linear expression, consisting of 6 terms. // - // Simple form Expansion of iv variable Reshaped with constants Comments for terms - // ----------- ------------------------ ----------------------- ------------------ - // adr = base = base = base (base % aw = 0) - // + invar + invar_factor * var_invar + C_invar * var_invar (term for invariant) - // / + scale * init + C_init * var_init (term for variable init) - // + scale * iv -> | + scale * pre_stride * pre_iter + C_pre * pre_iter (adjustable pre-loop term) - // \ + scale * main_stride * main_iter + C_main * main_iter (main-loop term) - // + con + con + C_const (sum of constant terms) + // Simple form Expansion of iv variable Reshaped with constants Comments for terms + // ----------- ------------------------ ----------------------- ------------------ + // adr = base = base = base (base % aw = 0) + // + invar + invar_factor * var_invar + C_invar * var_invar (term for invariant) + // / + iv_scale * init + C_init * var_init (term for variable init) + // + iv_scale * iv -> | + iv_scale * pre_stride * pre_iter + C_pre * pre_iter (adjustable pre-loop term) + // \ + iv_scale * main_stride * main_iter + C_main * main_iter (main-loop term) + // + con + con + C_const (sum of constant terms) // // We describe the 6 terms: // 1) The "base" of the address is the address of a Java object (e.g. 
array), @@ -523,18 +523,18 @@ AlignmentSolution* AlignmentSolver::solve() const { // base % ObjectAlignmentInBytes = 0 ==> base % aw = 0 // // 2) The "C_const" term is the sum of all constant terms. This is "con", - // plus "scale * init" if it is constant. + // plus "iv_scale * init" if it is constant. // 3) The "C_invar * var_invar" is the factorization of "invar" into a constant // and variable term. If there is no invariant, then "C_invar" is zero. // // invar = C_invar * var_invar (FAC_INVAR) // - // 4) The "C_init * var_init" is the factorization of "scale * init" into a + // 4) The "C_init * var_init" is the factorization of "iv_scale * init" into a // constant and a variable term. If "init" is constant, then "C_init" is // zero, and "C_const" accounts for "init" instead. // - // scale * init = C_init * var_init + scale * C_const_init (FAC_INIT) - // C_init = (init is constant) ? 0 : scale + // iv_scale * init = C_init * var_init + iv_scale * C_const_init (FAC_INIT) + // C_init = (init is constant) ? 0 : iv_scale // C_const_init = (init is constant) ? init : 0 // // 5) The "C_pre * pre_iter" term represents how much the iv is incremented @@ -547,14 +547,14 @@ AlignmentSolution* AlignmentSolver::solve() const { // Attribute init (i.e. _init_node) either to C_const or to C_init term. const int C_const_init = _init_node->is_ConI() ? _init_node->as_ConI()->get_int() : 0; - const int C_const = _vpointer.con() + C_const_init * _scale; + const int C_const = _vpointer.con() + C_const_init * _iv_scale; // Set C_invar depending on if invar is present const int C_invar = (_invar == nullptr) ? 0 : abs(_invar_factor); - const int C_init = _init_node->is_ConI() ? 0 : _scale; - const int C_pre = _scale * _pre_stride; - const int C_main = _scale * _main_stride; + const int C_init = _init_node->is_ConI() ? 
0 : _iv_scale; + const int C_pre = _iv_scale * _pre_stride; + const int C_main = _iv_scale * _main_stride; DEBUG_ONLY( trace_reshaped_form(C_const, C_const_init, C_invar, C_init, C_pre, C_main); ) @@ -839,7 +839,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // pre_iter_C_const = mx2 * q - sign(C_pre) * X // = mx2 * q - sign(C_pre) * C_const / abs(C_pre) // = mx2 * q - C_const / C_pre - // = mx2 * q - C_const / (scale * pre_stride) (11a) + // = mx2 * q - C_const / (iv_scale * pre_stride) (11a) // // If there is an invariant: // @@ -847,19 +847,19 @@ AlignmentSolution* AlignmentSolver::solve() const { // = my2 * q - sign(C_pre) * C_invar * var_invar / abs(C_pre) // = my2 * q - sign(C_pre) * invar / abs(C_pre) // = my2 * q - invar / C_pre - // = my2 * q - invar / (scale * pre_stride) (11b, with invar) + // = my2 * q - invar / (iv_scale * pre_stride) (11b, with invar) // // If there is no invariant (i.e. C_invar = 0 ==> Y = 0): // // pre_iter_C_invar = my2 * q (11b, no invar) // - // If init is variable (i.e. C_init = scale, init = var_init): + // If init is variable (i.e. C_init = iv_scale, init = var_init): // - // pre_iter_C_init = mz2 * q - sign(C_pre) * Z * var_init - // = mz2 * q - sign(C_pre) * C_init * var_init / abs(C_pre) - // = mz2 * q - sign(C_pre) * scale * init / abs(C_pre) - // = mz2 * q - scale * init / C_pre - // = mz2 * q - scale * init / (scale * pre_stride) + // pre_iter_C_init = mz2 * q - sign(C_pre) * Z * var_init + // = mz2 * q - sign(C_pre) * C_init * var_init / abs(C_pre) + // = mz2 * q - sign(C_pre) * iv_scale * init / abs(C_pre) + // = mz2 * q - iv_scale * init / C_pre + // = mz2 * q - iv_scale * init / (iv_scale * pre_stride) // = mz2 * q - init / pre_stride (11c, variable init) // // If init is constant (i.e. 
C_init = 0 ==> Z = 0): @@ -870,35 +870,35 @@ AlignmentSolution* AlignmentSolver::solve() const { // with m = mx2 + my2 + mz2: // // pre_iter = pre_iter_C_const + pre_iter_C_invar + pre_iter_C_init - // = mx2 * q - C_const / (scale * pre_stride) - // + my2 * q [- invar / (scale * pre_stride) ] - // + mz2 * q [- init / pre_stride ] + // = mx2 * q - C_const / (iv_scale * pre_stride) + // + my2 * q [- invar / (iv_scale * pre_stride) ] + // + mz2 * q [- init / pre_stride ] // // = m * q (periodic part) - // - C_const / (scale * pre_stride) (align constant term) - // [- invar / (scale * pre_stride) ] (align invariant term, if present) - // [- init / pre_stride ] (align variable init term, if present) (12) + // - C_const / (iv_scale * pre_stride) (align constant term) + // [- invar / (iv_scale * pre_stride) ] (align invariant term, if present) + // [- init / pre_stride ] (align variable init term, if present) (12) // // We can further simplify this solution by introducing integer 0 <= r < q: // - // r = (-C_const / (scale * pre_stride)) % q (13) + // r = (-C_const / (iv_scale * pre_stride)) % q (13) // - const int r = AlignmentSolution::mod(-C_const / (_scale * _pre_stride), q); + const int r = AlignmentSolution::mod(-C_const / (_iv_scale * _pre_stride), q); // // pre_iter = m * q + r - // [- invar / (scale * pre_stride) ] - // [- init / pre_stride ] (14) + // [- invar / (iv_scale * pre_stride) ] + // [- init / pre_stride ] (14) // // We thus get a solution that can be stated in terms of: // - // q (periodicity), r (constant alignment), invar, scale, pre_stride, init + // q (periodicity), r (constant alignment), invar, iv_scale, pre_stride, init // // However, pre_stride and init are shared by all mem_ref in the loop, hence we do not need to provide // them in the solution description. 
DEBUG_ONLY( trace_constrained_solution(C_const, C_invar, C_init, C_pre, q, r); ) - return new ConstrainedAlignmentSolution(_mem_ref, q, r, _invar, _scale); + return new ConstrainedAlignmentSolution(_mem_ref, q, r, _invar, _iv_scale); // APPENDIX: // We can now verify the success of the solution given by (12): @@ -906,48 +906,48 @@ AlignmentSolution* AlignmentSolver::solve() const { // adr % aw = // // -> Simple form - // (base + invar + scale * iv + con) % aw = + // (base + invar + iv_scale * iv + con) % aw = // // -> Expand iv - // (base + con + invar + scale * (init + pre_stride * pre_iter + main_stride * main_iter)) % aw = + // (base + con + invar + iv_scale * (init + pre_stride * pre_iter + main_stride * main_iter)) % aw = // // -> Reshape // (base + con + invar - // + scale * init - // + scale * pre_stride * pre_iter - // + scale * main_stride * main_iter)) % aw = + // + iv_scale * init + // + iv_scale * pre_stride * pre_iter + // + iv_scale * main_stride * main_iter)) % aw = // // -> base aligned: base % aw = 0 - // -> main-loop iterations aligned (2): C_main % aw = (scale * main_stride) % aw = 0 - // (con + invar + scale * init + scale * pre_stride * pre_iter) % aw = + // -> main-loop iterations aligned (2): C_main % aw = (iv_scale * main_stride) % aw = 0 + // (con + invar + iv_scale * init + iv_scale * pre_stride * pre_iter) % aw = // // -> apply (12) - // (con + invar + scale * init - // + scale * pre_stride * (m * q - C_const / (scale * pre_stride) - // [- invar / (scale * pre_stride) ] - // [- init / pre_stride ] - // ) + // (con + invar + iv_scale * init + // + iv_scale * pre_stride * (m * q - C_const / (iv_scale * pre_stride) + // [- invar / (iv_scale * pre_stride) ] + // [- init / pre_stride ] + // ) // ) % aw = // - // -> expand C_const = con [+ init * scale] (if init const) - // (con + invar + scale * init - // + scale * pre_stride * (m * q - con / (scale * pre_stride) - // [- init / pre_stride ] (if init constant) - // [- invar / (scale * 
pre_stride) ] (if invar present) - // [- init / pre_stride ] (if init variable) - // ) + // -> expand C_const = con [+ init * iv_scale] (if init const) + // (con + invar + iv_scale * init + // + iv_scale * pre_stride * (m * q - con / (iv_scale * pre_stride) + // [- init / pre_stride ] (if init constant) + // [- invar / (iv_scale * pre_stride) ] (if invar present) + // [- init / pre_stride ] (if init variable) + // ) // ) % aw = // // -> assuming invar = 0 if it is not present // -> merge the two init terms (variable or constant) - // -> apply (8): q = aw / (abs(C_pre)) = aw / abs(scale * pre_stride) - // -> and hence: (scale * pre_stride * q) % aw = 0 + // -> apply (8): q = aw / (abs(C_pre)) = aw / abs(iv_scale * pre_stride) + // -> and hence: (iv_scale * pre_stride * q) % aw = 0 // -> all terms are canceled out - // (con + invar + scale * init - // + scale * pre_stride * m * q -> aw aligned - // - scale * pre_stride * con / (scale * pre_stride) -> = con - // - scale * pre_stride * init / pre_stride -> = scale * init - // - scale * pre_stride * invar / (scale * pre_stride) -> = invar + // (con + invar + iv_scale * init + // + iv_scale * pre_stride * m * q -> aw aligned + // - iv_scale * pre_stride * con / (iv_scale * pre_stride) -> = con + // - iv_scale * pre_stride * init / pre_stride -> = iv_scale * init + // - iv_scale * pre_stride * invar / (iv_scale * pre_stride) -> = invar // ) % aw = 0 // // The solution given by (12) does indeed guarantee alignment. 
@@ -982,12 +982,12 @@ void AlignmentSolver::trace_start_solve() const { tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)", _pre_stride, _main_stride); - // adr = base + con + invar + scale * iv + // adr = base + con + invar + iv_scale * iv tty->print(" adr = base"); //VPointer::print_con_or_idx(_base); tty->print(" + con(%d) + invar", _vpointer.con()); //VPointer::print_con_or_idx(_invar); - tty->print_cr(" + scale(%d) * iv", _scale); + tty->print_cr(" + iv_scale(%d) * iv", _iv_scale); } } @@ -1009,7 +1009,7 @@ void AlignmentSolver::trace_reshaped_form(const int C_const, } else { tty->print_cr(" init is variable:"); tty->print_cr(" C_const_init = %d", C_const_init); - tty->print_cr(" C_init = abs(scale)= %d", C_init); + tty->print_cr(" C_init = abs(iv_scale)= %d", C_init); } if (_invar != nullptr) { tty->print_cr(" invariant present:"); @@ -1018,12 +1018,12 @@ void AlignmentSolver::trace_reshaped_form(const int C_const, tty->print_cr(" no invariant:"); tty->print_cr(" C_invar = %d", C_invar); } - tty->print_cr(" C_const = con(%d) + scale(%d) * C_const_init(%d) = %d", - _vpointer.con(), _scale, C_const_init, C_const); - tty->print_cr(" C_pre = scale(%d) * pre_stride(%d) = %d", - _scale, _pre_stride, C_pre); - tty->print_cr(" C_main = scale(%d) * main_stride(%d) = %d", - _scale, _main_stride, C_main); + tty->print_cr(" C_const = con(%d) + iv_scale(%d) * C_const_init(%d) = %d", + _vpointer.con(), _iv_scale, C_const_init, C_const); + tty->print_cr(" C_pre = iv_scale(%d) * pre_stride(%d) = %d", + _iv_scale, _pre_stride, C_pre); + tty->print_cr(" C_main = iv_scale(%d) * main_stride(%d) = %d", + _iv_scale, _main_stride, C_main); } } @@ -1092,13 +1092,13 @@ void AlignmentSolver::trace_constrained_solution(const int C_const, tty->print_cr(" EQ(10b): pre_iter_C_invar = my2 * q(%d) - sign(C_pre) * Y(%d) * var_invar", q, Y); tty->print_cr(" EQ(10c): pre_iter_C_init = mz2 * q(%d) - sign(C_pre) * Z(%d) * var_init ", q, Z); - tty->print_cr(" r = 
(-C_const(%d) / (scale(%d) * pre_stride(%d)) %% q(%d) = %d", - C_const, _scale, _pre_stride, q, r); + tty->print_cr(" r = (-C_const(%d) / (iv_scale(%d) * pre_stride(%d)) %% q(%d) = %d", + C_const, _iv_scale, _pre_stride, q, r); tty->print_cr(" EQ(14): pre_iter = m * q(%3d) - r(%d)", q, r); if (_invar != nullptr) { - tty->print_cr(" - invar / (scale(%d) * pre_stride(%d))", - _scale, _pre_stride); + tty->print_cr(" - invar / (iv_scale(%d) * pre_stride(%d))", + _iv_scale, _pre_stride); } if (!_init_node->is_ConI()) { tty->print_cr(" - init / pre_stride(%d)", diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index cc9471b5b3b1d..65cace70fc9fe 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -870,9 +870,9 @@ class VectorElementSizeStats { // When alignment is required, we must adjust the pre-loop iteration count pre_iter, // such that the address is aligned for any main_iter >= 0: // -// adr = base + invar + scale * init + con -// + scale * pre_stride * pre_iter -// + scale * main_stride * main_iter +// adr = base + invar + iv_scale * init + con +// + iv_scale * pre_stride * pre_iter +// + iv_scale * main_stride * main_iter // // The AlignmentSolver generates solutions of the following forms: // 1. Empty: No pre_iter guarantees alignment. @@ -881,9 +881,9 @@ class VectorElementSizeStats { // // The Constrained solution is of the following form: // -// pre_iter = m * q + r (for any integer m) -// [- invar / (scale * pre_stride) ] (if there is an invariant) -// [- init / pre_stride ] (if init is variable) +// pre_iter = m * q + r (for any integer m) +// [- invar / (iv_scale * pre_stride) ] (if there is an invariant) +// [- init / pre_stride ] (if init is variable) // // The solution is periodic with periodicity q, which is guaranteed to be a power of 2. 
// This periodic solution is "rotated" by three alignment terms: one for constants (r), @@ -977,18 +977,18 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { const int _q; const int _r; const Node* _invar; - const int _scale; + const int _iv_scale; public: ConstrainedAlignmentSolution(const MemNode* mem_ref, const int q, const int r, const Node* invar, - int scale) : + int iv_scale) : _mem_ref(mem_ref), _q(q), _r(r), _invar(invar), - _scale(scale) { + _iv_scale(iv_scale) { assert(q > 1 && is_power_of_2(q), "q must be power of 2"); assert(0 <= r && r < q, "r must be in modulo space of q"); assert(_mem_ref != nullptr, "must have mem_ref"); @@ -1022,12 +1022,12 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // for any integers m1 and m2: // // pre_iter = m1 * q1 + r1 - // [- invar1 / (scale1 * pre_stride) ] - // [- init / pre_stride ] + // [- invar1 / (iv_scale1 * pre_stride) ] + // [- init / pre_stride ] // // pre_iter = m2 * q2 + r2 - // [- invar2 / (scale2 * pre_stride) ] - // [- init / pre_stride ] + // [- invar2 / (iv_scale2 * pre_stride) ] + // [- init / pre_stride ] // // Note: pre_stride and init are identical for all mem_refs in the loop. // @@ -1036,13 +1036,13 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // // The invar alignment term is identical if either: // - both mem_refs have no invariant. - // - both mem_refs have the same invariant and the same scale. + // - both mem_refs have the same invariant and the same iv_scale. 
// if (s1->_invar != s2->_invar) { return new EmptyAlignmentSolution("invar not identical"); } - if (s1->_invar != nullptr && s1->_scale != s2->_scale) { - return new EmptyAlignmentSolution("has invar with different scale"); + if (s1->_invar != nullptr && s1->_iv_scale != s2->_iv_scale) { + return new EmptyAlignmentSolution("has invar with different iv_scale"); } // Now, we have reduced the problem to: @@ -1084,7 +1084,7 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { virtual void print() const override final { tty->print("m * q(%d) + r(%d)", _q, _r); if (_invar != nullptr) { - tty->print(" - invar[%d] / (scale(%d) * pre_stride)", _invar->_idx, _scale); + tty->print(" - invar[%d] / (iv_scale(%d) * pre_stride)", _invar->_idx, _iv_scale); } tty->print_cr(" [- init / pre_stride], mem_ref[%d]", mem_ref()->_idx); }; @@ -1102,8 +1102,8 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // // pre-loop: // iv = init + i * pre_stride -// adr = base + invar + scale * iv + con -// adr = base + invar + scale * (init + i * pre_stride) + con +// adr = base + invar + iv_scale * iv + con +// adr = base + invar + iv_scale * (init + i * pre_stride) + con // iv += pre_stride // i++ // @@ -1117,7 +1117,7 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // i = pre_iter + main_iter * unroll_factor // iv = init + i * pre_stride = init + pre_iter * pre_stride + main_iter * unroll_factor * pre_stride // = init + pre_iter * pre_stride + main_iter * main_stride -// adr = base + invar + scale * iv + con // must be aligned +// adr = base + invar + iv_scale * iv + con // must be aligned // iv += main_stride // i += unroll_factor // main_iter++ @@ -1150,7 +1150,7 @@ class AlignmentSolver { // // The Simple form of the address is disassembled by VPointer into: // - // adr = base + invar + scale * iv + con + // adr = base + invar + iv_scale * iv + con // // Where the iv can be written as: // @@ -1162,7 +1162,7 @@ class AlignmentSolver { const 
Node* _base; // base of address (e.g. Java array object, aw-aligned) const Node* _invar; const int _invar_factor; // known constant factor of invar - const int _scale; + const int _iv_scale; const Node* _init_node; // value of iv before pre-loop const int _pre_stride; // address increment per pre-loop iteration const int _main_stride; // address increment per main-loop iteration @@ -1192,7 +1192,7 @@ class AlignmentSolver { _base( vpointer.decomposed_form().base().object_or_native()), _invar( nullptr), // TODO _invar_factor( 1), - _scale( vpointer.iv_scale()), + _iv_scale( vpointer.iv_scale()), _init_node( init_node), _pre_stride( pre_stride), _main_stride( main_stride) From 7ec9cc736eddb613c9c229b5c32c8a6e0ff34583 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 10:36:41 +0100 Subject: [PATCH 044/130] replace iv_scale field with delegation --- src/hotspot/share/opto/vectorization.cpp | 26 ++++++++++++------------ src/hotspot/share/opto/vectorization.hpp | 8 ++++---- 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 8cc2be4c5b99e..6a716862cbe12 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -479,7 +479,7 @@ AlignmentSolution* AlignmentSolver::solve() const { assert(_aw > 0 && is_power_of_2(_aw), "aw must be power of 2"); // Out of simplicity: non power-of-2 iv_scale not supported. - if (abs(_iv_scale) == 0 || !is_power_of_2(abs(_iv_scale))) { + if (abs(iv_scale()) == 0 || !is_power_of_2(abs(iv_scale()))) { return new EmptyAlignmentSolution("non power-of-2 iv_scale not supported"); } @@ -547,14 +547,14 @@ AlignmentSolution* AlignmentSolver::solve() const { // Attribute init (i.e. _init_node) either to C_const or to C_init term. const int C_const_init = _init_node->is_ConI() ? 
_init_node->as_ConI()->get_int() : 0; - const int C_const = _vpointer.con() + C_const_init * _iv_scale; + const int C_const = _vpointer.con() + C_const_init * iv_scale(); // Set C_invar depending on if invar is present const int C_invar = (_invar == nullptr) ? 0 : abs(_invar_factor); - const int C_init = _init_node->is_ConI() ? 0 : _iv_scale; - const int C_pre = _iv_scale * _pre_stride; - const int C_main = _iv_scale * _main_stride; + const int C_init = _init_node->is_ConI() ? 0 : iv_scale(); + const int C_pre = iv_scale() * _pre_stride; + const int C_main = iv_scale() * _main_stride; DEBUG_ONLY( trace_reshaped_form(C_const, C_const_init, C_invar, C_init, C_pre, C_main); ) @@ -883,7 +883,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // // r = (-C_const / (iv_scale * pre_stride)) % q (13) // - const int r = AlignmentSolution::mod(-C_const / (_iv_scale * _pre_stride), q); + const int r = AlignmentSolution::mod(-C_const / (iv_scale() * _pre_stride), q); // // pre_iter = m * q + r // [- invar / (iv_scale * pre_stride) ] @@ -898,7 +898,7 @@ AlignmentSolution* AlignmentSolver::solve() const { DEBUG_ONLY( trace_constrained_solution(C_const, C_invar, C_init, C_pre, q, r); ) - return new ConstrainedAlignmentSolution(_mem_ref, q, r, _invar, _iv_scale); + return new ConstrainedAlignmentSolution(_mem_ref, q, r, _invar, iv_scale()); // APPENDIX: // We can now verify the success of the solution given by (12): @@ -987,7 +987,7 @@ void AlignmentSolver::trace_start_solve() const { //VPointer::print_con_or_idx(_base); tty->print(" + con(%d) + invar", _vpointer.con()); //VPointer::print_con_or_idx(_invar); - tty->print_cr(" + iv_scale(%d) * iv", _iv_scale); + tty->print_cr(" + iv_scale(%d) * iv", iv_scale()); } } @@ -1019,11 +1019,11 @@ void AlignmentSolver::trace_reshaped_form(const int C_const, tty->print_cr(" C_invar = %d", C_invar); } tty->print_cr(" C_const = con(%d) + iv_scale(%d) * C_const_init(%d) = %d", - _vpointer.con(), _iv_scale, C_const_init, C_const); + 
_vpointer.con(), iv_scale(), C_const_init, C_const); tty->print_cr(" C_pre = iv_scale(%d) * pre_stride(%d) = %d", - _iv_scale, _pre_stride, C_pre); + iv_scale(), _pre_stride, C_pre); tty->print_cr(" C_main = iv_scale(%d) * main_stride(%d) = %d", - _iv_scale, _main_stride, C_main); + iv_scale(), _main_stride, C_main); } } @@ -1093,12 +1093,12 @@ void AlignmentSolver::trace_constrained_solution(const int C_const, tty->print_cr(" EQ(10c): pre_iter_C_init = mz2 * q(%d) - sign(C_pre) * Z(%d) * var_init ", q, Z); tty->print_cr(" r = (-C_const(%d) / (iv_scale(%d) * pre_stride(%d)) %% q(%d) = %d", - C_const, _iv_scale, _pre_stride, q, r); + C_const, iv_scale(), _pre_stride, q, r); tty->print_cr(" EQ(14): pre_iter = m * q(%3d) - r(%d)", q, r); if (_invar != nullptr) { tty->print_cr(" - invar / (iv_scale(%d) * pre_stride(%d))", - _iv_scale, _pre_stride); + iv_scale(), _pre_stride); } if (!_init_node->is_ConI()) { tty->print_cr(" - init / pre_stride(%d)", diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 65cace70fc9fe..a2122e9bf1bc6 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -977,13 +977,13 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { const int _q; const int _r; const Node* _invar; - const int _iv_scale; + const jint _iv_scale; public: ConstrainedAlignmentSolution(const MemNode* mem_ref, const int q, const int r, const Node* invar, - int iv_scale) : + const jint iv_scale) : _mem_ref(mem_ref), _q(q), _r(r), @@ -1162,7 +1162,6 @@ class AlignmentSolver { const Node* _base; // base of address (e.g. 
Java array object, aw-aligned) const Node* _invar; const int _invar_factor; // known constant factor of invar - const int _iv_scale; const Node* _init_node; // value of iv before pre-loop const int _pre_stride; // address increment per pre-loop iteration const int _main_stride; // address increment per main-loop iteration @@ -1192,7 +1191,6 @@ class AlignmentSolver { _base( vpointer.decomposed_form().base().object_or_native()), _invar( nullptr), // TODO _invar_factor( 1), - _iv_scale( vpointer.iv_scale()), _init_node( init_node), _pre_stride( pre_stride), _main_stride( main_stride) @@ -1206,6 +1204,8 @@ class AlignmentSolver { AlignmentSolution* solve() const; private: + jint iv_scale() const { return _vpointer.iv_scale(); } + class EQ4 { private: const int _C_const; From e9bffd64259be269dabaf7a04bac9cb8a00f5fea Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 10:48:31 +0100 Subject: [PATCH 045/130] replace base field with delegation --- src/hotspot/share/opto/vectorization.cpp | 12 ++++++++---- src/hotspot/share/opto/vectorization.hpp | 3 +-- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 6a716862cbe12..a1a5b29eba858 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -507,7 +507,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // // Simple form Expansion of iv variable Reshaped with constants Comments for terms // ----------- ------------------------ ----------------------- ------------------ - // adr = base = base = base (base % aw = 0) + // adr = base = base = base (assume: base % aw = 0) // + invar + invar_factor * var_invar + C_invar * var_invar (term for invariant) // / + iv_scale * init + C_init * var_init (term for variable init) // + iv_scale * iv -> | + iv_scale * pre_stride * pre_iter + C_pre * pre_iter (adjustable pre-loop term) @@ -522,6 +522,9 @@ AlignmentSolution* 
AlignmentSolver::solve() const { // // base % ObjectAlignmentInBytes = 0 ==> base % aw = 0 // + // Note: we have been assuming that this also holds for native memory base + // addresses. This is incorrect, see JDK-8323582. + // // 2) The "C_const" term is the sum of all constant terms. This is "con", // plus "iv_scale * init" if it is constant. // 3) The "C_invar * var_invar" is the factorization of "invar" into a constant @@ -561,6 +564,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // We must find a pre_iter, such that adr is aw aligned: adr % aw = 0. Note, that we are defining the // modulo operator "%" such that the remainder is always positive, see AlignmentSolution::mod(i, q). // + // Note: the following assumption is incorrect for native memory bases, see JDK-8323582. // Since "base % aw = 0", we only need to ensure alignment of the other 5 terms: // // (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter + C_main * main_iter) % aw = 0 (1) @@ -918,6 +922,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // + iv_scale * main_stride * main_iter)) % aw = // // -> base aligned: base % aw = 0 + // Note: this assumption is incorrect for native memory bases, see JDK-8323582. 
// -> main-loop iterations aligned (2): C_main % aw = (iv_scale * main_stride) % aw = 0 // (con + invar + iv_scale * init + iv_scale * pre_stride * pre_iter) % aw = // @@ -983,8 +988,7 @@ void AlignmentSolver::trace_start_solve() const { _pre_stride, _main_stride); // adr = base + con + invar + iv_scale * iv - tty->print(" adr = base"); - //VPointer::print_con_or_idx(_base); + tty->print(" adr = base[%d]", base().object_or_native()->_idx); tty->print(" + con(%d) + invar", _vpointer.con()); //VPointer::print_con_or_idx(_invar); tty->print_cr(" + iv_scale(%d) * iv", iv_scale()); @@ -999,7 +1003,7 @@ void AlignmentSolver::trace_reshaped_form(const int C_const, const int C_main) const { if (is_trace()) { - tty->print(" = base[%d] + ", _base->_idx); + tty->print(" = base[%d] + ", base().object_or_native()->_idx); tty->print_cr("C_const(%d) + C_invar(%d) * var_invar + C_init(%d) * var_init + C_pre(%d) * pre_iter + C_main(%d) * main_iter", C_const, C_invar, C_init, C_pre, C_main); if (_init_node->is_ConI()) { diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index a2122e9bf1bc6..798dcd4642921 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -1159,7 +1159,6 @@ class AlignmentSolver { // pre_iter: number of pre-loop iterations (adjustable via pre-loop limit) // main_iter: number of main-loop iterations (main_iter >= 0) // - const Node* _base; // base of address (e.g. 
Java array object, aw-aligned) const Node* _invar; const int _invar_factor; // known constant factor of invar const Node* _init_node; // value of iv before pre-loop @@ -1188,7 +1187,6 @@ class AlignmentSolver { _element_size( vpointer.size()), _vector_width( _vector_length * _element_size), _aw( MIN2(_vector_width, ObjectAlignmentInBytes)), - _base( vpointer.decomposed_form().base().object_or_native()), _invar( nullptr), // TODO _invar_factor( 1), _init_node( init_node), @@ -1204,6 +1202,7 @@ class AlignmentSolver { AlignmentSolution* solve() const; private: + MemPointerDecomposedForm::Base base() const { return _vpointer.decomposed_form().base();} jint iv_scale() const { return _vpointer.iv_scale(); } class EQ4 { From 9e7eac0c3a05a9b828e23f50564cdb64dbf86c28 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 10:58:19 +0100 Subject: [PATCH 046/130] rn 2 more fields --- src/hotspot/share/opto/vectorization.cpp | 3 +-- src/hotspot/share/opto/vectorization.hpp | 6 +----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index a1a5b29eba858..f20fa039915be 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -963,8 +963,7 @@ void AlignmentSolver::trace_start_solve() const { if (is_trace()) { tty->print(" vector mem_ref:"); _mem_ref->dump(); - tty->print_cr(" vector_width = vector_length(%d) * element_size(%d) = %d", - _vector_length, _element_size, _vector_width); + tty->print_cr(" vector_width = %d", _vector_width); tty->print_cr(" aw = alignment_width = min(vector_width(%d), ObjectAlignmentInBytes(%d)) = %d", _vector_width, ObjectAlignmentInBytes, _aw); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 798dcd4642921..cf7edcefe4055 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -1133,8 +1133,6 @@ 
class AlignmentSolver { // TODO rm? const MemNode* _mem_ref; // first element - const uint _vector_length; // number of elements in vector - const int _element_size; const int _vector_width; // in bytes // All vector loads and stores need to be memory aligned. The alignment width (aw) in @@ -1183,9 +1181,7 @@ class AlignmentSolver { ) : _vpointer( vpointer), _mem_ref( mem_ref_not_null(mem_ref)), - _vector_length( vector_length), - _element_size( vpointer.size()), - _vector_width( _vector_length * _element_size), + _vector_width( vector_length * vpointer.size()), _aw( MIN2(_vector_width, ObjectAlignmentInBytes)), _invar( nullptr), // TODO _invar_factor( 1), From 85774ccd03216a81a9b34e4d3f33abb8270ad2d8 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 11:23:44 +0100 Subject: [PATCH 047/130] for_each_invar_summand first version --- src/hotspot/share/opto/mempointer.hpp | 10 ++++++++++ src/hotspot/share/opto/vectorization.cpp | 5 +++++ src/hotspot/share/opto/vectorization.hpp | 10 +++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 76d9cba7fdbc8..50417ad8f483d 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -650,6 +650,16 @@ class MemPointerDecomposedForm : public StackObj { return 0; } + template + void for_each_non_empty_summand(Callback callback) const { + for (int i = 0; i < SUMMANDS_SIZE; i++) { + const MemPointerSummand& s = summands_at(i); + if (s.variable() != nullptr) { + callback(s); + } + } + } + #ifndef PRODUCT void print_form_on(outputStream* st) const { if (_con.is_NaN()) { diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index f20fa039915be..12979ec0eaa8b 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -976,6 +976,11 @@ void AlignmentSolver::trace_start_solve() const { tty->print(" 
invar:"); _invar->dump(); } + tty->print_cr("invar_summands:"); + // TODO and check about base / iv + // _vpointer.for_each_invar_summand([&] (const MemPointerSummand& s) { + // s.print_on(tty); + // }, ); tty->print_cr(" invar_factor = %d", _invar_factor); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index cf7edcefe4055..4d56fa6ba83eb 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -743,7 +743,15 @@ class VPointer : public ArenaObj { jint size() const { assert(_is_valid, ""); return _size; } jint iv_scale() const { assert(_is_valid, ""); return _iv_scale; } jint con() const { return decomposed_form().con().value(); } - // TODO for each in invar_summands - maybe make it static so we can use it during init? + + template + void for_each_invar_summand(Callback callback, const VLoop& vloop) const { + decomposed_form().for_each_non_empty_summand([&] (const MemPointerSummand& s) { + if (is_invariant(s.variable(), vloop)) { + callback(s); + } + }); + } // Aliasing // TODO refactor together with MemPointer - should be shared code. Maybe the _size needs to be in ...Form? 
From a27eee6fc7593a74de18ac121f8e40f9a0250b7c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 12:10:32 +0100 Subject: [PATCH 048/130] add vloop to vpointer, and some printing --- src/hotspot/share/opto/superword.cpp | 2 +- src/hotspot/share/opto/vectorization.cpp | 42 +++++++++++++++--------- src/hotspot/share/opto/vectorization.hpp | 29 ++++++++-------- src/hotspot/share/opto/vtransform.cpp | 2 +- 4 files changed, 41 insertions(+), 34 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 10c972684c394..640a50ce50321 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -780,7 +780,7 @@ bool SuperWord::are_adjacent_refs(Node* s1, Node* s2) const { const VPointer& p1 = vpointer(s1->as_Mem()); const VPointer& p2 = vpointer(s2->as_Mem()); - return p1.is_adjacent_to_and_before(p2, _vloop); + return p1.is_adjacent_to_and_before(p2); } //------------------------------isomorphic--------------------------- diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 12979ec0eaa8b..c589b2d677b0f 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -253,7 +253,7 @@ void VLoopDependencyGraph::construct() { if (n1->is_Load() && n2->is_Load()) { continue; } const VPointer& p2 = _vpointers.vpointer(n2); - if (!p1.never_overlaps_with(p2, _vloop)) { + if (!p1.never_overlaps_with(p2)) { // Possibly overlapping memory memory_pred_edges.append(_body.bb_idx(n2)); } @@ -393,14 +393,14 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -bool VPointer::is_adjacent_to_and_before(const VPointer& other, const VLoop& vloop) const { +bool VPointer::is_adjacent_to_and_before(const VPointer& other) const { const MemPointerDecomposedForm& s1 = decomposed_form(); const MemPointerDecomposedForm& s2 = other.decomposed_form(); - const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 
NOT_PRODUCT( COMMA vloop.mptrace() )); + const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA _vloop.mptrace() )); const bool is_adjacent = aliasing.is_always_at_distance(_size); #ifndef PRODUCT - if (vloop.mptrace().is_trace_adjacency()) { + if (_vloop.mptrace().is_trace_adjacency()) { tty->print("Adjacent: %s, because size = %d and aliasing = ", is_adjacent ? "true" : "false", _size); aliasing.print_on(tty); @@ -411,10 +411,10 @@ bool VPointer::is_adjacent_to_and_before(const VPointer& other, const VLoop& vlo return is_adjacent; } -bool VPointer::never_overlaps_with(const VPointer& other, const VLoop& vloop) const { +bool VPointer::never_overlaps_with(const VPointer& other) const { if (!is_valid() || !other.is_valid()) { #ifndef PRODUCT - if (vloop.mptrace().is_trace_overlap()) { + if (_vloop.mptrace().is_trace_overlap()) { tty->print_cr("Never Overlap: false, because of invalid VPointer."); } #endif @@ -424,7 +424,7 @@ bool VPointer::never_overlaps_with(const VPointer& other, const VLoop& vloop) co const MemPointerDecomposedForm& s1 = decomposed_form(); const MemPointerDecomposedForm& s2 = other.decomposed_form(); - const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA vloop.mptrace() )); + const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA _vloop.mptrace() )); // The aliasing tries to compute: // distance = s2 - s1 @@ -440,7 +440,7 @@ bool VPointer::never_overlaps_with(const VPointer& other, const VLoop& vloop) co bool is_never_overlap = aliasing.is_never_in_distance_range(distance_lo, distance_hi); #ifndef PRODUCT - if (vloop.mptrace().is_trace_overlap()) { + if (_vloop.mptrace().is_trace_overlap()) { tty->print("Never Overlap: %s, distance_lo: %d, distance_hi: %d, aliasing: ", is_never_overlap ? 
"true" : "false", distance_lo, distance_hi); aliasing.print_on(tty); @@ -464,6 +464,11 @@ void VPointer::print_on(outputStream* st) const { _decomposed_form.base().print_on(st); st->print(", form: "); _decomposed_form.print_form_on(st); + st->print(", invar_summands: "); + for_each_invar_summand([&] (const MemPointerSummand& s) { + s.print_on(tty); + st->print(","); + }); st->print_cr("]"); } #endif @@ -963,6 +968,8 @@ void AlignmentSolver::trace_start_solve() const { if (is_trace()) { tty->print(" vector mem_ref:"); _mem_ref->dump(); + tty->print(" VPointer: "); + _vpointer.print_on(tty); tty->print_cr(" vector_width = %d", _vector_width); tty->print_cr(" aw = alignment_width = min(vector_width(%d), ObjectAlignmentInBytes(%d)) = %d", _vector_width, ObjectAlignmentInBytes, _aw); @@ -972,15 +979,18 @@ void AlignmentSolver::trace_start_solve() const { _init_node->dump(); } - if (_invar != nullptr) { - tty->print(" invar:"); - _invar->dump(); + tty->print_cr(" invar_summands:"); + int invar_count = 0; + _vpointer.for_each_invar_summand([&] (const MemPointerSummand& s) { + tty->print(" "); + s.print_on(tty); + tty->print(" -> "); + s.variable()->dump(); + invar_count++; + }); + if (invar_count == 0) { + tty->print(" No invar_summands."); } - tty->print_cr("invar_summands:"); - // TODO and check about base / iv - // _vpointer.for_each_invar_summand([&] (const MemPointerSummand& s) { - // s.print_on(tty); - // }, ); tty->print_cr(" invar_factor = %d", _invar_factor); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 4d56fa6ba83eb..0dbaa82fc8ed5 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -704,6 +704,7 @@ class VPointer : public ArenaObj { private: typedef MemPointerDecomposedFormParser::Callback Callback; + const VLoop& _vloop; const MemPointerDecomposedForm _decomposed_form; const jint _size; @@ -713,19 +714,13 @@ class VPointer : public ArenaObj { const bool 
_is_valid; // TODO any accessor should assert if not valid! public: - // Default constructor, e.g. for GrowableArray. - VPointer() : - _decomposed_form(), - _size(0), - _iv_scale(0), - _is_valid(false) {} - template VPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : + _vloop(vloop), _decomposed_form(init_decomposed_form(mem, adr_node_callback)), _size(mem->memory_size()), - _iv_scale(init_iv_scale(_decomposed_form, vloop)), - _is_valid(init_is_valid(_decomposed_form, vloop)) + _iv_scale(init_iv_scale(_decomposed_form, _vloop)), + _is_valid(init_is_valid(_decomposed_form, _vloop)) { #ifndef PRODUCT if (vloop.mptrace().is_trace_pointer()) { @@ -745,9 +740,11 @@ class VPointer : public ArenaObj { jint con() const { return decomposed_form().con().value(); } template - void for_each_invar_summand(Callback callback, const VLoop& vloop) const { + void for_each_invar_summand(Callback callback) const { decomposed_form().for_each_non_empty_summand([&] (const MemPointerSummand& s) { - if (is_invariant(s.variable(), vloop)) { + Node* variable = s.variable(); + if (variable != decomposed_form().base().object_or_native() && + is_invariant(variable, _vloop)) { callback(s); } }); @@ -755,15 +752,15 @@ class VPointer : public ArenaObj { // Aliasing // TODO refactor together with MemPointer - should be shared code. Maybe the _size needs to be in ...Form? - bool is_adjacent_to_and_before(const VPointer& other, const VLoop& vloop) const; - bool never_overlaps_with(const VPointer& other, const VLoop& vloop) const; + bool is_adjacent_to_and_before(const VPointer& other) const; + bool never_overlaps_with(const VPointer& other) const; - bool overlap_possible_with_any_in(const GrowableArray& nodes, const VLoop& vloop) const { + bool overlap_possible_with_any_in(const GrowableArray& nodes) const { MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? 
for (int i = 0; i < nodes.length(); i++) { MemNode* mem = nodes.at(i)->as_Mem(); - VPointer mem_p(mem->as_Mem(), vloop, empty_callback); - if (!never_overlaps_with(mem_p, vloop)) { + VPointer mem_p(mem->as_Mem(), _vloop, empty_callback); + if (!never_overlaps_with(mem_p)) { return true; // possible overlap } } diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index e23cd17ea6d4a..0fe9004b7368c 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -583,7 +583,7 @@ VTransformApplyResult VTransformLoadVectorNode::apply(const VLoopAnalyzer& vloop // TODO refactor with VPointer for this vector load! MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? VPointer store_p(mem->as_Mem(), vloop_analyzer.vloop(), empty_callback); - if (store_p.overlap_possible_with_any_in(nodes(), vloop_analyzer.vloop())) { + if (store_p.overlap_possible_with_any_in(nodes())) { break; } else { mem = mem->in(MemNode::Memory); From 00b442d56d60989e10badc4acb359fd722d7bad0 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 12:57:40 +0100 Subject: [PATCH 049/130] invar_factor --- src/hotspot/share/opto/vectorization.cpp | 8 +++++--- src/hotspot/share/opto/vectorization.hpp | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index c589b2d677b0f..5a2bd70dcbef2 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -558,7 +558,8 @@ AlignmentSolution* AlignmentSolver::solve() const { const int C_const = _vpointer.con() + C_const_init * iv_scale(); // Set C_invar depending on if invar is present - const int C_invar = (_invar == nullptr) ? 0 : abs(_invar_factor); + const jint invar_factor = _vpointer.compute_invar_factor(); + const int C_invar = (_invar == nullptr) ? 
0 : invar_factor; const int C_init = _init_node->is_ConI() ? 0 : iv_scale(); const int C_pre = iv_scale() * _pre_stride; @@ -992,7 +993,8 @@ void AlignmentSolver::trace_start_solve() const { tty->print(" No invar_summands."); } - tty->print_cr(" invar_factor = %d", _invar_factor); + const jint invar_factor = _vpointer.compute_invar_factor(); + tty->print_cr(" invar_factor = %d", invar_factor); // TODO fix up printing // iv = init + pre_iter * pre_stride + main_iter * main_stride @@ -1031,7 +1033,7 @@ void AlignmentSolver::trace_reshaped_form(const int C_const, } if (_invar != nullptr) { tty->print_cr(" invariant present:"); - tty->print_cr(" C_invar = abs(invar_factor) = %d", C_invar); + tty->print_cr(" C_invar = invar_factor = %d", C_invar); } else { tty->print_cr(" no invariant:"); tty->print_cr(" C_invar = %d", C_invar); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 0dbaa82fc8ed5..c2f5d9e105d18 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -750,6 +750,19 @@ class VPointer : public ArenaObj { }); } + // Greatest common factor among the scales of the invar_summands. + // Out of simplicity, we only factor out positive powers-of-2, + // between 1 and ObjectAlignmentInBytes. + jint compute_invar_factor() const { + jint factor = ObjectAlignmentInBytes; + for_each_invar_summand([&] (const MemPointerSummand& s) { + while (!s.scale().is_multiple_of(NoOverflowInt(factor))) { + factor = factor / 2; + } + }); + return factor; + } + // Aliasing // TODO refactor together with MemPointer - should be shared code. Maybe the _size needs to be in ...Form? 
bool is_adjacent_to_and_before(const VPointer& other) const; @@ -1163,7 +1176,6 @@ class AlignmentSolver { // main_iter: number of main-loop iterations (main_iter >= 0) // const Node* _invar; - const int _invar_factor; // known constant factor of invar const Node* _init_node; // value of iv before pre-loop const int _pre_stride; // address increment per pre-loop iteration const int _main_stride; // address increment per main-loop iteration @@ -1189,7 +1201,6 @@ class AlignmentSolver { _vector_width( vector_length * vpointer.size()), _aw( MIN2(_vector_width, ObjectAlignmentInBytes)), _invar( nullptr), // TODO - _invar_factor( 1), _init_node( init_node), _pre_stride( pre_stride), _main_stride( main_stride) From 296455461c933981a818a1c791ea610f2b7b5c60 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 13:09:51 +0100 Subject: [PATCH 050/130] trace_start_solve --- src/hotspot/share/opto/vectorization.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 5a2bd70dcbef2..030989b2ed5f7 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -980,7 +980,7 @@ void AlignmentSolver::trace_start_solve() const { _init_node->dump(); } - tty->print_cr(" invar_summands:"); + tty->print_cr(" invar = SUM(invar_summands), invar_summands:"); int invar_count = 0; _vpointer.for_each_invar_summand([&] (const MemPointerSummand& s) { tty->print(" "); @@ -990,24 +990,24 @@ void AlignmentSolver::trace_start_solve() const { invar_count++; }); if (invar_count == 0) { - tty->print(" No invar_summands."); + tty->print_cr(" No invar_summands."); } const jint invar_factor = _vpointer.compute_invar_factor(); tty->print_cr(" invar_factor = %d", invar_factor); - // TODO fix up printing // iv = init + pre_iter * pre_stride + main_iter * main_stride tty->print(" iv = init"); - //VPointer::print_con_or_idx(_init_node); 
+ if (_init_node->is_ConI()) { + tty->print("(%4d)", _init_node->as_ConI()->get_int()); + } else { + tty->print("[%4d]", _init_node->_idx); + } tty->print_cr(" + pre_iter * pre_stride(%d) + main_iter * main_stride(%d)", _pre_stride, _main_stride); - // adr = base + con + invar + iv_scale * iv tty->print(" adr = base[%d]", base().object_or_native()->_idx); - tty->print(" + con(%d) + invar", _vpointer.con()); - //VPointer::print_con_or_idx(_invar); - tty->print_cr(" + iv_scale(%d) * iv", iv_scale()); + tty->print(" + con(%d) + invar + iv_scale(%d) * iv", _vpointer.con(), iv_scale()); } } From 18461de85dccf354ed045485adb827d627a828eb Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 14:40:32 +0100 Subject: [PATCH 051/130] rm _invar field --- src/hotspot/share/opto/vectorization.cpp | 9 ++++----- src/hotspot/share/opto/vectorization.hpp | 9 +++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 030989b2ed5f7..3ca6225fdc8cd 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -558,8 +558,7 @@ AlignmentSolution* AlignmentSolver::solve() const { const int C_const = _vpointer.con() + C_const_init * iv_scale(); // Set C_invar depending on if invar is present - const jint invar_factor = _vpointer.compute_invar_factor(); - const int C_invar = (_invar == nullptr) ? 0 : invar_factor; + const int C_invar = _vpointer.compute_invar_factor(); const int C_init = _init_node->is_ConI() ? 
0 : iv_scale(); const int C_pre = iv_scale() * _pre_stride; @@ -908,7 +907,7 @@ AlignmentSolution* AlignmentSolver::solve() const { DEBUG_ONLY( trace_constrained_solution(C_const, C_invar, C_init, C_pre, q, r); ) - return new ConstrainedAlignmentSolution(_mem_ref, q, r, _invar, iv_scale()); + return new ConstrainedAlignmentSolution(_mem_ref, q, r, nullptr /*TODO*/, iv_scale()); // APPENDIX: // We can now verify the success of the solution given by (12): @@ -1031,7 +1030,7 @@ void AlignmentSolver::trace_reshaped_form(const int C_const, tty->print_cr(" C_const_init = %d", C_const_init); tty->print_cr(" C_init = abs(iv_scale)= %d", C_init); } - if (_invar != nullptr) { + if (C_invar != 0) { tty->print_cr(" invariant present:"); tty->print_cr(" C_invar = invar_factor = %d", C_invar); } else { @@ -1116,7 +1115,7 @@ void AlignmentSolver::trace_constrained_solution(const int C_const, C_const, iv_scale(), _pre_stride, q, r); tty->print_cr(" EQ(14): pre_iter = m * q(%3d) - r(%d)", q, r); - if (_invar != nullptr) { + if (C_invar != 0) { tty->print_cr(" - invar / (iv_scale(%d) * pre_stride(%d))", iv_scale(), _pre_stride); } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index c2f5d9e105d18..9de11a87de481 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -752,15 +752,18 @@ class VPointer : public ArenaObj { // Greatest common factor among the scales of the invar_summands. // Out of simplicity, we only factor out positive powers-of-2, - // between 1 and ObjectAlignmentInBytes. + // between 1 and ObjectAlignmentInBytes. If the invar is empty, + // i.e. there is no summand in invar_summands, we return 0. 
jint compute_invar_factor() const { jint factor = ObjectAlignmentInBytes; + int invar_count = 0; for_each_invar_summand([&] (const MemPointerSummand& s) { + invar_count++; while (!s.scale().is_multiple_of(NoOverflowInt(factor))) { factor = factor / 2; } }); - return factor; + return invar_count > 0 ? factor : 0; } // Aliasing @@ -1175,7 +1178,6 @@ class AlignmentSolver { // pre_iter: number of pre-loop iterations (adjustable via pre-loop limit) // main_iter: number of main-loop iterations (main_iter >= 0) // - const Node* _invar; const Node* _init_node; // value of iv before pre-loop const int _pre_stride; // address increment per pre-loop iteration const int _main_stride; // address increment per main-loop iteration @@ -1200,7 +1202,6 @@ class AlignmentSolver { _mem_ref( mem_ref_not_null(mem_ref)), _vector_width( vector_length * vpointer.size()), _aw( MIN2(_vector_width, ObjectAlignmentInBytes)), - _invar( nullptr), // TODO _init_node( init_node), _pre_stride( pre_stride), _main_stride( main_stride) From 40e39a08de140238a1ffdcccd949ed2337558598 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 14:51:29 +0100 Subject: [PATCH 052/130] pass vpointer for invar and iv_scale --- src/hotspot/share/opto/vectorization.cpp | 2 +- src/hotspot/share/opto/vectorization.hpp | 11 +++++++---- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 3ca6225fdc8cd..f3c81c960fb8e 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -907,7 +907,7 @@ AlignmentSolution* AlignmentSolver::solve() const { DEBUG_ONLY( trace_constrained_solution(C_const, C_invar, C_init, C_pre, q, r); ) - return new ConstrainedAlignmentSolution(_mem_ref, q, r, nullptr /*TODO*/, iv_scale()); + return new ConstrainedAlignmentSolution(_mem_ref, q, r, _vpointer /* holds invar and iv_scale */); // APPENDIX: // We can now verify the success of the 
solution given by (12): diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 9de11a87de481..718ba59d0d0cf 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -997,19 +997,22 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { const MemNode* _mem_ref; const int _q; const int _r; + // Use VPointer for invar and iv_scale + const VPointer& _vpointer; + // TODO use VPointer for invar and iv_scale const Node* _invar; const jint _iv_scale; public: ConstrainedAlignmentSolution(const MemNode* mem_ref, const int q, const int r, - const Node* invar, - const jint iv_scale) : + const VPointer& vpointer) : _mem_ref(mem_ref), _q(q), _r(r), - _invar(invar), - _iv_scale(iv_scale) { + _vpointer(vpointer), + _invar(nullptr), + _iv_scale(vpointer.iv_scale()) { assert(q > 1 && is_power_of_2(q), "q must be power of 2"); assert(0 <= r && r < q, "r must be in modulo space of q"); assert(_mem_ref != nullptr, "must have mem_ref"); From 4302768d4b86276d86e088756a7594d0ff6c838f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 22 Nov 2024 14:56:16 +0100 Subject: [PATCH 053/130] rm iv_scale field, add assert for TODO --- src/hotspot/share/opto/vectorization.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 718ba59d0d0cf..a5181b21a6e42 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -1001,7 +1001,6 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { const VPointer& _vpointer; // TODO use VPointer for invar and iv_scale const Node* _invar; - const jint _iv_scale; public: ConstrainedAlignmentSolution(const MemNode* mem_ref, const int q, @@ -1011,8 +1010,8 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { _q(q), _r(r), _vpointer(vpointer), - _invar(nullptr), - 
_iv_scale(vpointer.iv_scale()) { + _invar(nullptr) + { assert(q > 1 && is_power_of_2(q), "q must be power of 2"); assert(0 <= r && r < q, "r must be in modulo space of q"); assert(_mem_ref != nullptr, "must have mem_ref"); @@ -1062,10 +1061,11 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // - both mem_refs have no invariant. // - both mem_refs have the same invariant and the same iv_scale. // + assert(false, "fix invar check"); if (s1->_invar != s2->_invar) { return new EmptyAlignmentSolution("invar not identical"); } - if (s1->_invar != nullptr && s1->_iv_scale != s2->_iv_scale) { + if (s1->_invar != nullptr && s1->_vpointer.iv_scale() != s2->_vpointer.iv_scale()) { return new EmptyAlignmentSolution("has invar with different iv_scale"); } @@ -1108,7 +1108,7 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { virtual void print() const override final { tty->print("m * q(%d) + r(%d)", _q, _r); if (_invar != nullptr) { - tty->print(" - invar[%d] / (iv_scale(%d) * pre_stride)", _invar->_idx, _iv_scale); + tty->print(" - invar[%d] / (iv_scale(%d) * pre_stride)", _invar->_idx, _vpointer.iv_scale()); } tty->print_cr(" [- init / pre_stride], mem_ref[%d]", mem_ref()->_idx); }; From cf78fd22c20c3b66d8c83c1f5f3a534b36d90d64 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 25 Nov 2024 15:10:06 +0100 Subject: [PATCH 054/130] fix invar check --- src/hotspot/share/opto/mempointer.hpp | 11 ++++++++++- src/hotspot/share/opto/vectorization.hpp | 25 ++++++++++++++++++------ 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 50417ad8f483d..91fc1b5416ce3 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -628,10 +628,19 @@ class MemPointerDecomposedForm : public StackObj { private: bool has_same_summands_as(const MemPointerDecomposedForm& other, uint start) const; - bool has_same_summands_as(const 
MemPointerDecomposedForm& other) const { return has_same_summands_as(other, 0); }; + bool has_same_summands_as(const MemPointerDecomposedForm& other) const { return has_same_summands_as(other, 0); } bool has_different_base_but_otherwise_same_summands_as(const MemPointerDecomposedForm& other) const; public: + bool has_same_non_base_summands_as(const MemPointerDecomposedForm& other) const { + if (!base().is_known() || !other.base().is_known()) { + assert(false, "unknown base case is not answered optimally"); + return false; + } + // Known base at 0th summand: all other summands are non-base summands. + return has_same_summands_as(other, 1); + } + const MemPointerSummand& summands_at(const uint i) const { assert(i < SUMMANDS_SIZE, "in bounds"); return _summands[i]; } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index a5181b21a6e42..aec8fd441a28a 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -766,6 +766,20 @@ class VPointer : public ArenaObj { return invar_count > 0 ? factor : 0; } + int count_invar_summands() const { + int invar_count = 0; + for_each_invar_summand([&] (const MemPointerSummand& s) { + invar_count++; + }); + return invar_count; + } + + bool has_same_invar_and_iv_scale_as(const VPointer& other) const { + // If we have the same invar_summands, and the same iv summand with the same iv_scale, + // then all summands except the base must be the same. + return decomposed_form().has_same_non_base_summands_as(other.decomposed_form()); + } + // Aliasing // TODO refactor together with MemPointer - should be shared code. Maybe the _size needs to be in ...Form? 
bool is_adjacent_to_and_before(const VPointer& other) const; @@ -1022,6 +1036,7 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { virtual bool is_constrained() const override final { return true; } const MemNode* mem_ref() const { return _mem_ref; } + const VPointer& vpointer() const { return _vpointer; } virtual const ConstrainedAlignmentSolution* as_constrained() const override final { return this; } @@ -1061,12 +1076,10 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // - both mem_refs have no invariant. // - both mem_refs have the same invariant and the same iv_scale. // - assert(false, "fix invar check"); - if (s1->_invar != s2->_invar) { - return new EmptyAlignmentSolution("invar not identical"); - } - if (s1->_invar != nullptr && s1->_vpointer.iv_scale() != s2->_vpointer.iv_scale()) { - return new EmptyAlignmentSolution("has invar with different iv_scale"); + bool both_no_invar = s1->vpointer().count_invar_summands() == 0 && + s2->vpointer().count_invar_summands() == 0; + if(!both_no_invar && !s1->vpointer().has_same_invar_and_iv_scale_as(s2->vpointer())) { + return new EmptyAlignmentSolution("invar alignment term not identical"); } // Now, we have reduced the problem to: From 606ebf969ac4a5bb7e93771d2a457d8c37568a08 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 25 Nov 2024 15:14:40 +0100 Subject: [PATCH 055/130] rm invar in AlignmentSolution --- src/hotspot/share/opto/vectorization.hpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index aec8fd441a28a..4c387436158f1 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -1013,8 +1013,6 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { const int _r; // Use VPointer for invar and iv_scale const VPointer& _vpointer; - // TODO use VPointer for invar and iv_scale - const Node* 
_invar; public: ConstrainedAlignmentSolution(const MemNode* mem_ref, const int q, @@ -1023,8 +1021,7 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { _mem_ref(mem_ref), _q(q), _r(r), - _vpointer(vpointer), - _invar(nullptr) + _vpointer(vpointer) { assert(q > 1 && is_power_of_2(q), "q must be power of 2"); assert(0 <= r && r < q, "r must be in modulo space of q"); @@ -1076,9 +1073,12 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // - both mem_refs have no invariant. // - both mem_refs have the same invariant and the same iv_scale. // - bool both_no_invar = s1->vpointer().count_invar_summands() == 0 && - s2->vpointer().count_invar_summands() == 0; - if(!both_no_invar && !s1->vpointer().has_same_invar_and_iv_scale_as(s2->vpointer())) { + // Use VPointer to do checks on invar and iv_scale: + const VPointer& p1 = s1->vpointer(); + const VPointer& p2 = s2->vpointer(); + bool both_no_invar = p1.count_invar_summands() == 0 && + p2.count_invar_summands() == 0; + if(!both_no_invar && !p1.has_same_invar_and_iv_scale_as(p2)) { return new EmptyAlignmentSolution("invar alignment term not identical"); } @@ -1120,8 +1120,8 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { virtual void print() const override final { tty->print("m * q(%d) + r(%d)", _q, _r); - if (_invar != nullptr) { - tty->print(" - invar[%d] / (iv_scale(%d) * pre_stride)", _invar->_idx, _vpointer.iv_scale()); + if (_vpointer.count_invar_summands() > 0) { + tty->print(" - invar / (iv_scale(%d) * pre_stride)", _vpointer.iv_scale()); } tty->print_cr(" [- init / pre_stride], mem_ref[%d]", mem_ref()->_idx); }; From 7a50c76f5c71114a30e9dec716db329b07889f3d Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 25 Nov 2024 15:34:38 +0100 Subject: [PATCH 056/130] fix invar in adjust_pre_loop_limit_to_align_main_loop_vectors --- src/hotspot/share/opto/superword.cpp | 38 ++++++++++++++++++---------- 1 file changed, 24 insertions(+), 14 deletions(-) diff --git 
a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 640a50ce50321..6240b21ecddd6 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2842,7 +2842,6 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { const int con = p.con(); Node* base = p.decomposed_form().base().object_or_native(); bool is_base_native = p.decomposed_form().base().is_native(); - Node* invar = nullptr; // TODO // TODO: maybe use NoOverflowInt here, and for solver? @@ -2859,11 +2858,15 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { tty->print_cr(" con: %d", con); tty->print(" base:"); base->dump(); - if (invar == nullptr) { + if (p.count_invar_summands() == 0) { tty->print_cr(" invar: null"); } else { - tty->print(" invar:"); - invar->dump(); + tty->print_cr(" invar_summands:"); + p.for_each_invar_summand([&] (const MemPointerSummand& s) { + tty->print(" -> "); + s.print_on(tty); + }); + tty->cr(); } tty->print(" old_limit: "); old_limit->dump(); @@ -2906,24 +2909,31 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { Node* xbic = igvn().intcon(is_sub ? -con : con); TRACE_ALIGN_VECTOR_NODE(xbic); - // 1.2: invar (if it exists) - if (invar != nullptr) { - if (igvn().type(invar)->isa_long()) { + // 1.2: invar = SUM(invar_summands) + // We iteratively add / subtract all invar_summands, if there are any. + p.for_each_invar_summand([&] (const MemPointerSummand& s) { + Node* invar_variable = s.variable(); + jint invar_scale = s.scale().value(); + if (igvn().type(invar_variable)->isa_long()) { // Computations are done % (vector width/element size) so it's // safe to simply convert invar to an int and loose the upper 32 // bit half. 
- invar = new ConvL2INode(invar); - phase()->register_new_node(invar, pre_ctrl); - TRACE_ALIGN_VECTOR_NODE(invar); - } + invar_variable = new ConvL2INode(invar_variable); + phase()->register_new_node(invar_variable, pre_ctrl); + TRACE_ALIGN_VECTOR_NODE(invar_variable); + } + Node* invar_scale_con = igvn().intcon(invar_scale); + Node* invar_summand = new MulINode(invar_variable, invar_scale_con); + phase()->register_new_node(invar_summand, pre_ctrl); + TRACE_ALIGN_VECTOR_NODE(invar_summand); if (is_sub) { - xbic = new SubINode(xbic, invar); + xbic = new SubINode(xbic, invar_summand); } else { - xbic = new AddINode(xbic, invar); + xbic = new AddINode(xbic, invar_summand); } phase()->register_new_node(xbic, pre_ctrl); TRACE_ALIGN_VECTOR_NODE(xbic); - } + }); // 1.3: base (unless base is guaranteed aw aligned) if (aw > ObjectAlignmentInBytes || is_base_native) { From 2ba13b9108a8816877d85dea3e40b359f9ee782e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 10:02:55 +0100 Subject: [PATCH 057/130] fix large stride and scale --- src/hotspot/share/opto/vectorization.hpp | 37 ++++++++++++++++++------ 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 4c387436158f1..59fa24da7a0ca 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -719,8 +719,8 @@ class VPointer : public ArenaObj { _vloop(vloop), _decomposed_form(init_decomposed_form(mem, adr_node_callback)), _size(mem->memory_size()), - _iv_scale(init_iv_scale(_decomposed_form, _vloop)), - _is_valid(init_is_valid(_decomposed_form, _vloop)) + _iv_scale(init_iv_scale()), + _is_valid(init_is_valid()) { #ifndef PRODUCT if (vloop.mptrace().is_trace_pointer()) { @@ -807,11 +807,11 @@ class VPointer : public ArenaObj { return parser.decomposed_form(); } - static jint init_iv_scale(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { + jint 
init_iv_scale() const { for (uint i = 0; i < MemPointerDecomposedForm::SUMMANDS_SIZE; i++) { - const MemPointerSummand& summand = decomposed_form.summands_at(i); + const MemPointerSummand& summand = _decomposed_form.summands_at(i); Node* variable = summand.variable(); - if (variable == vloop.iv()) { + if (variable == _vloop.iv()) { return summand.scale().value(); } } @@ -821,8 +821,8 @@ // Check that all variables are either the iv, or else invariants. // TODO why pre-loop - static bool init_is_valid(const MemPointerDecomposedForm& decomposed_form, const VLoop& vloop) { - if (!decomposed_form.base().is_known()) { + bool init_is_valid() const { + if (!_decomposed_form.base().is_known()) { // VPointer needs to know if it is native (off-heap) or object (on-heap). // We may for example have failed to fully decompose the MemPointer, possibly // because such a decomposition is not considered safe. @@ -830,12 +830,31 @@ } for (uint i = 0; i < MemPointerDecomposedForm::SUMMANDS_SIZE; i++) { - const MemPointerSummand& summand = decomposed_form.summands_at(i); + const MemPointerSummand& summand = _decomposed_form.summands_at(i); Node* variable = summand.variable(); - if (variable != nullptr && variable != vloop.iv() && !is_invariant(variable, vloop)) { + if (variable != nullptr && variable != _vloop.iv() && !is_invariant(variable, _vloop)) { return false; } } + + // In the pointer analysis, and especially the AlignVector analysis, we assume that + // stride and scale are not too large. For example, we multiply "iv_scale * iv_stride", + // and assume that this does not overflow the int range. We also take "abs(iv_scale)" + // and "abs(iv_stride)", which would overflow for min_int = -(2^31). Still, we want + // to at least allow small and moderately large stride and scale. Therefore, we + // allow values up to 2^30, which is only a factor 2 smaller than the max/min int. 
+ // Normal performance relevant code will have much lower values. And the restriction + // allows us to keep the rest of the autovectorization code much simpler, since we + // do not have to deal with overflows. + jlong long_iv_scale = _iv_scale; + jlong long_iv_stride = _vloop.iv_stride(); + jlong max_val = 1 << 30; + if (abs(long_iv_scale) >= max_val || + abs(long_iv_stride) >= max_val || + abs(long_iv_scale * long_iv_stride) >= max_val) { + return false; + } + return true; } From 904d709c9be5507a49e8b8b0a5b977fb7c309fec Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 10:09:56 +0100 Subject: [PATCH 058/130] rm useless TODOs --- src/hotspot/share/opto/superword.cpp | 4 +--- src/hotspot/share/opto/vectorization.hpp | 2 +- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 6240b21ecddd6..a7f1cdeb6a6f5 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2843,8 +2843,6 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { Node* base = p.decomposed_form().base().object_or_native(); bool is_base_native = p.decomposed_form().base().is_native(); - // TODO: maybe use NoOverflowInt here, and for solver? - #ifdef ASSERT if (_trace._align_vector) { tty->print_cr("\nVTransform::adjust_pre_loop_limit_to_align_main_loop_vectors:"); @@ -2876,7 +2874,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { #endif if (iv_stride == 0 || !is_power_of_2(abs(iv_stride)) || - iv_scale == 0 || !is_power_of_2(abs(iv_scale)) || // TODO abs ok? 
+ iv_scale == 0 || !is_power_of_2(abs(iv_scale)) || abs(iv_scale) >= aw) { #ifdef ASSERT if (_trace._align_vector) { diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 59fa24da7a0ca..7e5f0e940979d 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -711,7 +711,7 @@ class VPointer : public ArenaObj { // Derived, for quicker use. const jint _iv_scale; - const bool _is_valid; // TODO any accessor should assert if not valid! + const bool _is_valid; public: template From 8fa0c5b304ae6677ff202a55860379634c203278 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 11:52:32 +0100 Subject: [PATCH 059/130] route in trace --- src/hotspot/share/opto/mempointer.cpp | 25 +++-- src/hotspot/share/opto/mempointer.hpp | 127 ++++++++++++++++------- src/hotspot/share/opto/vectorization.hpp | 9 +- 3 files changed, 110 insertions(+), 51 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 240f7b2b10344..f3e76ce28dec6 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -33,7 +33,9 @@ MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form(C assert(_worklist.is_empty(), "no prior parsing"); assert(_summands.is_empty(), "no prior parsing"); + // TODO maybe refactor out _mem? Node* pointer = _mem->in(MemNode::Address); + const jint size = _mem->memory_size(); // Start with the trivial summand. _worklist.push(MemPointerSummand(pointer, NoOverflowInt(1))); @@ -43,12 +45,16 @@ MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form(C int traversal_count = 0; while (_worklist.is_nonempty()) { // Bail out if the graph is too complex. 
- if (traversal_count++ > 1000) { return MemPointerDecomposedForm::make_trivial(pointer); } + if (traversal_count++ > 1000) { + return MemPointerDecomposedForm::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); + } parse_sub_expression(_worklist.pop(), adr_node_callback); } // Bail out if there is a constant overflow. - if (_con.is_NaN()) { return MemPointerDecomposedForm::make_trivial(pointer); } + if (_con.is_NaN()) { + return MemPointerDecomposedForm::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); + } // Sorting by variable idx means that all summands with the same variable are consecutive. // This simplifies the combining of summands with the same variable below. @@ -68,7 +74,7 @@ MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form(C } // Bail out if scale is NaN. if (scale.is_NaN()) { - return MemPointerDecomposedForm::make_trivial(pointer); + return MemPointerDecomposedForm::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); } // Keep summands with non-zero scale. if (!scale.is_zero()) { @@ -77,7 +83,7 @@ MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form(C } _summands.trunc_to(pos_put); - return MemPointerDecomposedForm::make(pointer, _summands, _con); + return MemPointerDecomposedForm::make(pointer, _summands, _con, size NOT_PRODUCT(COMMA _trace)); } // Parse a sub-expression of the pointer, starting at the current summand. 
We parse the @@ -454,17 +460,14 @@ bool MemPointerDecomposedForm::has_different_base_but_otherwise_same_summands_as return has_same_summands_as(other, 1); } -bool MemPointer::is_adjacent_to_and_before(const MemPointer& other) const { - const MemPointerDecomposedForm& s1 = decomposed_form(); - const MemPointerDecomposedForm& s2 = other.decomposed_form(); - const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA _trace )); - const jint size = mem()->memory_size(); - const bool is_adjacent = aliasing.is_always_at_distance(size); +bool MemPointerDecomposedForm::is_adjacent_to_and_before(const MemPointerDecomposedForm& other) const { + const MemPointerAliasing aliasing = get_aliasing_with(other NOT_PRODUCT( COMMA _trace )); + const bool is_adjacent = aliasing.is_always_at_distance(_size); #ifndef PRODUCT if (_trace.is_trace_adjacency()) { tty->print("Adjacent: %s, because size = %d and aliasing = ", - is_adjacent ? "true" : "false", size); + is_adjacent ? "true" : "false", _size); aliasing.print_on(tty); tty->cr(); } diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 91fc1b5416ce3..4293ea07077c4 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -28,8 +28,18 @@ #include "opto/memnode.hpp" #include "opto/noOverflowInt.hpp" -// The MemPointer is a shared facility to parse pointers and check the aliasing of pointers, -// e.g. checking if two stores are adjacent. +// The MemPointer is a shared facility to parse pointers and check the aliasing of pointers. 
+// +// A MemPointer points to a region in memory, starting at a "pointer", and extending for "size" bytes: +// [pointer, pointer + size) +// +// We can check if two loads / two stores: +// - are adjacent -> pack multiple memops into a single memop +// - never overlap -> independent, can swap order +// +// Other use-cases: +// - alignment -> find an alignment solution for all memops in a vectorized loop +// - detect partial overlap -> indicates store-to-load-forwarding failures // // ----------------------------------------------------------------------------------------- // @@ -561,27 +571,42 @@ class MemPointerDecomposedForm : public StackObj { }; private: + NOT_PRODUCT( const TraceMemPointer& _trace; ) MemPointerSummand _summands[SUMMANDS_SIZE]; NoOverflowInt _con; Base _base; + jint _size; public: - // Empty - MemPointerDecomposedForm() : _con(NoOverflowInt::make_NaN()) {} + // Empty - TODO why? + MemPointerDecomposedForm(NOT_PRODUCT(const TraceMemPointer& trace)) : + NOT_PRODUCT(_trace(trace) COMMA) + _con(NoOverflowInt::make_NaN()) {} private: // Default / trivial: pointer = 0 + 1 * pointer - MemPointerDecomposedForm(Node* pointer) : + MemPointerDecomposedForm(Node* pointer, + const jint size + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + NOT_PRODUCT(_trace(trace) COMMA) _con(NoOverflowInt(0)), - _base(Base()) + _base(Base()), + _size(size) { assert(pointer != nullptr, "pointer must be non-null"); _summands[0] = MemPointerSummand(pointer, NoOverflowInt(1)); + assert(1 <= _size && _size <= 2048 && is_power_of_2(_size), "valid size"); } - MemPointerDecomposedForm(Node* pointer, const GrowableArray& summands, const NoOverflowInt& con) : + MemPointerDecomposedForm(Node* pointer, + const GrowableArray& summands, + const NoOverflowInt& con, + const jint size + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + NOT_PRODUCT(_trace(trace) COMMA) _con(con), - _base(Base::make(pointer, summands)) + _base(Base::make(pointer, summands)), + _size(size) { 
assert(!_con.is_NaN(), "non-NaN constant"); assert(summands.length() <= SUMMANDS_SIZE, "summands must fit"); @@ -608,23 +633,32 @@ class MemPointerDecomposedForm : public StackObj { _summands[pos++] = summands.at(i); } assert(pos == summands.length(), "copied all summands"); + + assert(1 <= _size && _size <= 2048 && is_power_of_2(_size), "valid size"); } public: - static MemPointerDecomposedForm make_trivial(Node* pointer) { - return MemPointerDecomposedForm(pointer); + static MemPointerDecomposedForm make_trivial(Node* pointer, + const jint size + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { + return MemPointerDecomposedForm(pointer, size NOT_PRODUCT(COMMA trace)); } - static MemPointerDecomposedForm make(Node* pointer, const GrowableArray& summands, const NoOverflowInt& con) { + static MemPointerDecomposedForm make(Node* pointer, + const GrowableArray& summands, + const NoOverflowInt& con, + const jint size + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { if (summands.length() <= SUMMANDS_SIZE) { - return MemPointerDecomposedForm(pointer, summands, con); + return MemPointerDecomposedForm(pointer, summands, con, size NOT_PRODUCT(COMMA trace)); } else { - return MemPointerDecomposedForm::make_trivial(pointer); + return MemPointerDecomposedForm::make_trivial(pointer, size NOT_PRODUCT(COMMA trace)); } } + // TODO make private? 
MemPointerAliasing get_aliasing_with(const MemPointerDecomposedForm& other - NOT_PRODUCT( COMMA const TraceMemPointer& trace) ) const; + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) const; private: bool has_same_summands_as(const MemPointerDecomposedForm& other, uint start) const; @@ -648,6 +682,7 @@ class MemPointerDecomposedForm : public StackObj { const NoOverflowInt con() const { return _con; } const Base& base() const { return _base; } + jint size() const { return _size; } static int cmp_summands(const MemPointerDecomposedForm& a, const MemPointerDecomposedForm& b) { for (int i = 0; i < SUMMANDS_SIZE; i++) { @@ -669,6 +704,8 @@ class MemPointerDecomposedForm : public StackObj { } } + bool is_adjacent_to_and_before(const MemPointerDecomposedForm& other) const; + #ifndef PRODUCT void print_form_on(outputStream* st) const { if (_con.is_NaN()) { @@ -697,7 +734,17 @@ class MemPointerDecomposedForm : public StackObj { }; class MemPointerDecomposedFormParser : public StackObj { +public: + class Callback : public StackObj { + public: + virtual void callback(Node* n) { /* do nothing by default */ } + }; + private: + Callback _empty_callback; + + NOT_PRODUCT( const TraceMemPointer& _trace; ) + const MemNode* _mem; // Internal data-structures for parsing. @@ -709,23 +756,22 @@ class MemPointerDecomposedFormParser : public StackObj { MemPointerDecomposedForm _decomposed_form; public: - class Callback : public StackObj { - public: - virtual void callback(Node* n) { /* do nothing by default */ } - }; - - MemPointerDecomposedFormParser(const MemNode* mem) : - _mem(mem), _con(NoOverflowInt(0)) - { - Callback empty_callback; - _decomposed_form = parse_decomposed_form(empty_callback); - } + // No callback. 
+ MemPointerDecomposedFormParser(const MemNode* mem + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + NOT_PRODUCT(_trace(trace) COMMA) + _mem(mem), + _con(NoOverflowInt(0)), + _decomposed_form(parse_decomposed_form(_empty_callback)) {} - MemPointerDecomposedFormParser(const MemNode* mem, Callback& adr_node_callback) : - _mem(mem), _con(NoOverflowInt(0)) - { - _decomposed_form = parse_decomposed_form(adr_node_callback); - } + // With callback. + MemPointerDecomposedFormParser(const MemNode* mem, + Callback& adr_node_callback + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + NOT_PRODUCT(_trace(trace) COMMA) + _mem(mem), + _con(NoOverflowInt(0)), + _decomposed_form(parse_decomposed_form(adr_node_callback)) {} const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } @@ -737,20 +783,21 @@ class MemPointerDecomposedFormParser : public StackObj { bool is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const; }; +// TODO maybe merge with decomposed form? // Facility to parse the pointer of a Load or Store, so that aliasing between two such // memory operations can be determined (e.g. adjacency). 
class MemPointer : public StackObj { private: + NOT_PRODUCT( const TraceMemPointer& _trace; ) + const MemNode* _mem; const MemPointerDecomposedForm _decomposed_form; - NOT_PRODUCT( const TraceMemPointer& _trace; ) - public: - MemPointer(const MemNode* mem NOT_PRODUCT( COMMA const TraceMemPointer& trace)) : + MemPointer(const MemNode* mem NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), - _decomposed_form(init_decomposed_form(_mem)) - NOT_PRODUCT( COMMA _trace(trace) ) + _decomposed_form(init_decomposed_form()) { #ifndef PRODUCT if (_trace.is_trace_pointer()) { @@ -764,13 +811,15 @@ class MemPointer : public StackObj { const MemNode* mem() const { return _mem; } const MemPointerDecomposedForm decomposed_form() const { return _decomposed_form; } - bool is_adjacent_to_and_before(const MemPointer& other) const; + bool is_adjacent_to_and_before(const MemPointer& other) const { + return decomposed_form().is_adjacent_to_and_before(other.decomposed_form()); + } private: - static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem) { - assert(mem->is_Store(), "only stores are supported"); + const MemPointerDecomposedForm init_decomposed_form() { + assert(_mem->is_Store(), "only stores are supported"); ResourceMark rm; - MemPointerDecomposedFormParser parser(mem); + MemPointerDecomposedFormParser parser(_mem NOT_PRODUCT(COMMA _trace)); return parser.decomposed_form(); } }; diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 7e5f0e940979d..5f8aa92420008 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -803,7 +803,14 @@ class VPointer : public ArenaObj { static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem, Callback& adr_node_callback) { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are supported"); ResourceMark rm; - MemPointerDecomposedFormParser parser(mem, 
adr_node_callback); + // TODO wire in +#ifndef PRODUCT + const TraceMemPointer trace(false, + false, + false, + true); +#endif + MemPointerDecomposedFormParser parser(mem, adr_node_callback NOT_PRODUCT(COMMA trace)); return parser.decomposed_form(); } From 850de72f8a41f8a0db1428f580df2b6a6cedd25f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 13:04:21 +0100 Subject: [PATCH 060/130] refactor out aliasing to MemPointerDecomposedFormParser --- src/hotspot/share/opto/mempointer.cpp | 29 ++++++++++++ src/hotspot/share/opto/mempointer.hpp | 1 + src/hotspot/share/opto/vectorization.cpp | 58 ------------------------ src/hotspot/share/opto/vectorization.hpp | 36 +++++++++------ 4 files changed, 52 insertions(+), 72 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index f3e76ce28dec6..e241e2b9128cc 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -475,3 +475,32 @@ bool MemPointerDecomposedForm::is_adjacent_to_and_before(const MemPointerDecompo return is_adjacent; } + +bool MemPointerDecomposedForm::never_overlaps_with(const MemPointerDecomposedForm& other) const { + const MemPointerAliasing aliasing = get_aliasing_with(other NOT_PRODUCT( COMMA _trace )); + + // The aliasing tries to compute: + // distance = other - this + // + // We know that we have no overlap if we can prove: + // this >= other + other.size || this + this.size <= other + // + // Which we can restate as: + // distance <= -other.size || this.size <= distance + // + const jint distance_lo = -other.size(); + const jint distance_hi = size(); + bool is_never_overlap = aliasing.is_never_in_distance_range(distance_lo, distance_hi); + +#ifndef PRODUCT + if (_trace.is_trace_overlap()) { + tty->print("Never Overlap: %s, distance_lo: %d, distance_hi: %d, aliasing: ", + is_never_overlap ? 
"true" : "false", distance_lo, distance_hi); + aliasing.print_on(tty); + tty->cr(); + } +#endif + + return is_never_overlap; +} + diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 4293ea07077c4..70381c3ed0d28 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -705,6 +705,7 @@ class MemPointerDecomposedForm : public StackObj { } bool is_adjacent_to_and_before(const MemPointerDecomposedForm& other) const; + bool never_overlaps_with(const MemPointerDecomposedForm& other) const; #ifndef PRODUCT void print_form_on(outputStream* st) const { diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index f3c81c960fb8e..5dd495edc9a50 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -393,64 +393,6 @@ void VLoopDependencyGraph::PredsIterator::next() { } } -bool VPointer::is_adjacent_to_and_before(const VPointer& other) const { - const MemPointerDecomposedForm& s1 = decomposed_form(); - const MemPointerDecomposedForm& s2 = other.decomposed_form(); - const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA _vloop.mptrace() )); - const bool is_adjacent = aliasing.is_always_at_distance(_size); - -#ifndef PRODUCT - if (_vloop.mptrace().is_trace_adjacency()) { - tty->print("Adjacent: %s, because size = %d and aliasing = ", - is_adjacent ? 
"true" : "false", _size); - aliasing.print_on(tty); - tty->cr(); - } -#endif - - return is_adjacent; -} - -bool VPointer::never_overlaps_with(const VPointer& other) const { - if (!is_valid() || !other.is_valid()) { -#ifndef PRODUCT - if (_vloop.mptrace().is_trace_overlap()) { - tty->print_cr("Never Overlap: false, because of invalid VPointer."); - } -#endif - - return false; - } - - const MemPointerDecomposedForm& s1 = decomposed_form(); - const MemPointerDecomposedForm& s2 = other.decomposed_form(); - const MemPointerAliasing aliasing = s1.get_aliasing_with(s2 NOT_PRODUCT( COMMA _vloop.mptrace() )); - - // The aliasing tries to compute: - // distance = s2 - s1 - // - // We know that we have no overlap if we can prove: - // s1 >= s2 + s2_size || s1 + s1_size <= s2 - // - // Which we can restate as: - // distance <= -s2_size || s1_size <= distance - // - const jint distance_lo = -other.size(); - const jint distance_hi = size(); - bool is_never_overlap = aliasing.is_never_in_distance_range(distance_lo, distance_hi); - -#ifndef PRODUCT - if (_vloop.mptrace().is_trace_overlap()) { - tty->print("Never Overlap: %s, distance_lo: %d, distance_hi: %d, aliasing: ", - is_never_overlap ? 
"true" : "false", distance_lo, distance_hi); - aliasing.print_on(tty); - tty->cr(); - } -#endif - - return is_never_overlap; -} - #ifndef PRODUCT void VPointer::print_on(outputStream* st) const { st->print("VPointer["); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 5f8aa92420008..2d2136551b39d 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -717,7 +717,7 @@ class VPointer : public ArenaObj { template VPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : _vloop(vloop), - _decomposed_form(init_decomposed_form(mem, adr_node_callback)), + _decomposed_form(init_decomposed_form(mem, adr_node_callback, vloop)), _size(mem->memory_size()), _iv_scale(init_iv_scale()), _is_valid(init_is_valid()) @@ -780,10 +780,21 @@ class VPointer : public ArenaObj { return decomposed_form().has_same_non_base_summands_as(other.decomposed_form()); } - // Aliasing - // TODO refactor together with MemPointer - should be shared code. Maybe the _size needs to be in ...Form? - bool is_adjacent_to_and_before(const VPointer& other) const; - bool never_overlaps_with(const VPointer& other) const; + bool is_adjacent_to_and_before(const VPointer& other) const { + return decomposed_form().is_adjacent_to_and_before(other.decomposed_form()); + } + + bool never_overlaps_with(const VPointer& other) const { + if (!is_valid() || !other.is_valid()) { +#ifndef PRODUCT + if (_vloop.mptrace().is_trace_overlap()) { + tty->print_cr("Never Overlap: false, because of invalid VPointer."); + } +#endif + return false; + } + return decomposed_form().never_overlaps_with(other.decomposed_form()); + } bool overlap_possible_with_any_in(const GrowableArray& nodes) const { MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? 
@@ -800,17 +811,14 @@ class VPointer : public ArenaObj { NOT_PRODUCT( void print_on(outputStream* st) const; ) private: - static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem, Callback& adr_node_callback) { + static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem, + Callback& adr_node_callback, + const VLoop& vloop) { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are supported"); ResourceMark rm; - // TODO wire in -#ifndef PRODUCT - const TraceMemPointer trace(false, - false, - false, - true); -#endif - MemPointerDecomposedFormParser parser(mem, adr_node_callback NOT_PRODUCT(COMMA trace)); + MemPointerDecomposedFormParser parser(mem, + adr_node_callback + NOT_PRODUCT(COMMA vloop.mptrace())); return parser.decomposed_form(); } From 70793a25286cc78797c043f8f1da584b87642410 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 13:11:12 +0100 Subject: [PATCH 061/130] cleanup --- src/hotspot/share/opto/mempointer.hpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 70381c3ed0d28..7f090bdd087d2 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -577,13 +577,6 @@ class MemPointerDecomposedForm : public StackObj { Base _base; jint _size; -public: - // Empty - TODO why? - MemPointerDecomposedForm(NOT_PRODUCT(const TraceMemPointer& trace)) : - NOT_PRODUCT(_trace(trace) COMMA) - _con(NoOverflowInt::make_NaN()) {} - -private: // Default / trivial: pointer = 0 + 1 * pointer MemPointerDecomposedForm(Node* pointer, const jint size @@ -656,11 +649,10 @@ class MemPointerDecomposedForm : public StackObj { } } - // TODO make private? 
+private: MemPointerAliasing get_aliasing_with(const MemPointerDecomposedForm& other NOT_PRODUCT(COMMA const TraceMemPointer& trace)) const; -private: bool has_same_summands_as(const MemPointerDecomposedForm& other, uint start) const; bool has_same_summands_as(const MemPointerDecomposedForm& other) const { return has_same_summands_as(other, 0); } bool has_different_base_but_otherwise_same_summands_as(const MemPointerDecomposedForm& other) const; From 08ab4124254a883f340d21754797992f60f922f0 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 13:15:38 +0100 Subject: [PATCH 062/130] more cleanup --- src/hotspot/share/opto/mempointer.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 7f090bdd087d2..7103943d1f37b 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -573,9 +573,9 @@ class MemPointerDecomposedForm : public StackObj { private: NOT_PRODUCT( const TraceMemPointer& _trace; ) MemPointerSummand _summands[SUMMANDS_SIZE]; - NoOverflowInt _con; - Base _base; - jint _size; + const NoOverflowInt _con; + const Base _base; + const jint _size; // Default / trivial: pointer = 0 + 1 * pointer MemPointerDecomposedForm(Node* pointer, @@ -591,6 +591,7 @@ class MemPointerDecomposedForm : public StackObj { assert(1 <= _size && _size <= 2048 && is_power_of_2(_size), "valid size"); } + // pointer = SUM(SUMMANDS) + con MemPointerDecomposedForm(Node* pointer, const GrowableArray& summands, const NoOverflowInt& con, From 3a8d07ececb2e5264311ff370d39ee00d0d10686 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 13:41:52 +0100 Subject: [PATCH 063/130] MemPointer renaming --- src/hotspot/share/opto/memnode.cpp | 4 +- src/hotspot/share/opto/mempointer.cpp | 39 ++++--- src/hotspot/share/opto/mempointer.hpp | 131 +++++++++++------------ src/hotspot/share/opto/superword.cpp | 6 +- 
src/hotspot/share/opto/vectorization.cpp | 2 +- src/hotspot/share/opto/vectorization.hpp | 24 ++--- src/hotspot/share/opto/vtransform.cpp | 2 +- 7 files changed, 102 insertions(+), 106 deletions(-) diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index 5be3b302fd7c4..4b45e95c1dae1 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -2947,8 +2947,8 @@ bool MergePrimitiveStores::is_adjacent_pair(const StoreNode* use_store, const St is_trace_adjacency(), true); #endif - const MemPointer pointer_use(use_store NOT_PRODUCT( COMMA trace )); - const MemPointer pointer_def(def_store NOT_PRODUCT( COMMA trace )); + const MemPointerX pointer_use(use_store NOT_PRODUCT( COMMA trace )); + const MemPointerX pointer_def(def_store NOT_PRODUCT( COMMA trace )); return pointer_def.is_adjacent_to_and_before(pointer_use); } diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index e241e2b9128cc..357deeaabfb66 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -29,7 +29,7 @@ // Recursively parse the pointer expression with a DFS all-path traversal // (i.e. with node repetitions), starting at the pointer. -MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form(Callback& adr_node_callback) { +MemPointer MemPointerParser::parse_decomposed_form(Callback& adr_node_callback) { assert(_worklist.is_empty(), "no prior parsing"); assert(_summands.is_empty(), "no prior parsing"); @@ -46,14 +46,14 @@ MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form(C while (_worklist.is_nonempty()) { // Bail out if the graph is too complex. 
if (traversal_count++ > 1000) { - return MemPointerDecomposedForm::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); + return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); } parse_sub_expression(_worklist.pop(), adr_node_callback); } // Bail out if there is a constant overflow. if (_con.is_NaN()) { - return MemPointerDecomposedForm::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); + return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); } // Sorting by variable idx means that all summands with the same variable are consecutive. @@ -74,7 +74,7 @@ MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form(C } // Bail out if scale is NaN. if (scale.is_NaN()) { - return MemPointerDecomposedForm::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); + return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); } // Keep summands with non-zero scale. if (!scale.is_zero()) { @@ -83,13 +83,13 @@ MemPointerDecomposedForm MemPointerDecomposedFormParser::parse_decomposed_form(C } _summands.trunc_to(pos_put); - return MemPointerDecomposedForm::make(pointer, _summands, _con, size NOT_PRODUCT(COMMA _trace)); + return MemPointer::make(pointer, _summands, _con, size NOT_PRODUCT(COMMA _trace)); } // Parse a sub-expression of the pointer, starting at the current summand. We parse the // current node, and see if it can be decomposed into further summands, or if the current // summand is terminal. -void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSummand& summand, Callback& adr_node_callback) { +void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, Callback& adr_node_callback) { Node* n = summand.variable(); const NoOverflowInt scale = summand.scale(); const NoOverflowInt one(1); @@ -202,7 +202,7 @@ void MemPointerDecomposedFormParser::parse_sub_expression(const MemPointerSumman // Check if the decomposition of operation opc is guaranteed to be safe. 
// Please refer to the definition of "safe decomposition" in mempointer.hpp -bool MemPointerDecomposedFormParser::is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const { +bool MemPointerParser::is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const { #ifndef _LP64 // On 32-bit platforms, the pointer has 32bits, and thus any higher bits will always // be truncated. Thus, it does not matter if we have int or long overflows. @@ -312,7 +312,7 @@ bool MemPointerDecomposedFormParser::is_safe_to_decompose_op(const int opc, cons #endif } -MemPointerDecomposedForm::Base MemPointerDecomposedForm::Base::make(Node* pointer, const GrowableArray& summands) { +MemPointer::Base MemPointer::Base::make(Node* pointer, const GrowableArray& summands) { // Bad form -> unknown. AddPNode* adr = pointer->isa_AddP(); if (adr == nullptr) { return Base(); } @@ -334,7 +334,7 @@ MemPointerDecomposedForm::Base MemPointerDecomposedForm::Base::make(Node* pointe } } -Node* MemPointerDecomposedForm::Base::find_base(Node* object_base, const GrowableArray& summands) { +Node* MemPointer::Base::find_base(Node* object_base, const GrowableArray& summands) { for (int i = 0; i < summands.length(); i++) { const MemPointerSummand& s = summands.at(i); assert(s.variable() != nullptr, "no empty summands"); @@ -350,20 +350,19 @@ Node* MemPointerDecomposedForm::Base::find_base(Node* object_base, const Growabl return nullptr; } -// Compute the aliasing between two MemPointerDecomposedForm. We use the "MemPointer Lemma" to -// prove that the computed aliasing also applies for the underlying pointers. Note that the -// condition (S0) is already given, because the MemPointerDecomposedForm is always constructed -// using only safe decompositions. +// Compute the aliasing between two MemPointer. We use the "MemPointer Lemma" to prove that the +// computed aliasing also applies for the underlying pointers. 
Note that the condition (S0) is +// already given, because the MemPointer is always constructed using only safe decompositions. // // Pre-Condition: // We assume that both pointers are in-bounds of their respective memory object. If this does // not hold, for example, with the use of Unsafe, then we would already have undefined behavior, // and we are allowed to do anything. -MemPointerAliasing MemPointerDecomposedForm::get_aliasing_with(const MemPointerDecomposedForm& other - NOT_PRODUCT( COMMA const TraceMemPointer& trace) ) const { +MemPointerAliasing MemPointer::get_aliasing_with(const MemPointer& other + NOT_PRODUCT( COMMA const TraceMemPointer& trace) ) const { #ifndef PRODUCT if (trace.is_trace_aliasing()) { - tty->print_cr("MemPointerDecomposedForm::get_aliasing_with:"); + tty->print_cr("MemPointer::get_aliasing_with:"); print_on(tty); other.print_on(tty); } @@ -435,14 +434,14 @@ MemPointerAliasing MemPointerDecomposedForm::get_aliasing_with(const MemPointerD } } -bool MemPointerDecomposedForm::has_same_summands_as(const MemPointerDecomposedForm& other, uint start) const { +bool MemPointer::has_same_summands_as(const MemPointer& other, uint start) const { for (uint i = start; i < SUMMANDS_SIZE; i++) { if (summands_at(i) != other.summands_at(i)) { return false; } } return true; } -bool MemPointerDecomposedForm::has_different_base_but_otherwise_same_summands_as(const MemPointerDecomposedForm& other) const { +bool MemPointer::has_different_base_but_otherwise_same_summands_as(const MemPointer& other) const { if (!base().is_object() || !other.base().is_object() || base().object() == other.base().object()) { @@ -460,7 +459,7 @@ bool MemPointerDecomposedForm::has_different_base_but_otherwise_same_summands_as return has_same_summands_as(other, 1); } -bool MemPointerDecomposedForm::is_adjacent_to_and_before(const MemPointerDecomposedForm& other) const { +bool MemPointer::is_adjacent_to_and_before(const MemPointer& other) const { const MemPointerAliasing aliasing = 
get_aliasing_with(other NOT_PRODUCT( COMMA _trace )); const bool is_adjacent = aliasing.is_always_at_distance(_size); @@ -476,7 +475,7 @@ bool MemPointerDecomposedForm::is_adjacent_to_and_before(const MemPointerDecompo return is_adjacent; } -bool MemPointerDecomposedForm::never_overlaps_with(const MemPointerDecomposedForm& other) const { +bool MemPointer::never_overlaps_with(const MemPointer& other) const { const MemPointerAliasing aliasing = get_aliasing_with(other NOT_PRODUCT( COMMA _trace )); // The aliasing tries to compute: diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 7103943d1f37b..59600999456d1 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -153,7 +153,7 @@ // // ----------------------------------------------------------------------------------------- // -// MemPointerDecomposedForm: +// MemPointer: // When the pointer is parsed, it is decomposed into a SUM of summands plus a constant: // // pointer = SUM(summands) + con @@ -171,17 +171,6 @@ // On 64-bit systems, this decomposed form is computed with long-add/mul, on 32-bit systems // it is computed with int-add/mul. // -// MemPointerAliasing: -// The decomposed form allows us to determine the aliasing between two pointers easily. For -// example, if two pointers are identical, except for their constant: -// -// pointer1 = SUM(summands) + con1 -// pointer2 = SUM(summands) + con2 -// -// then we can easily compute the distance between the pointers (distance = con2 - con1), -// and determine if they are adjacent. -// -// MemPointerDecomposedFormParser: // Any pointer can be parsed into this (default / trivial) decomposed form: // // pointer = 1 * pointer + 0 @@ -204,11 +193,20 @@ // // This allows us to easily see that these two pointers are adjacent (distance = 4). // -// Hence, in MemPointerDecomposedFormParser::parse_decomposed_form, we start with the pointer as -// a trivial summand. 
A summand can either be decomposed further or it is terminal (cannot -// be decomposed further). We decompose the summands recursively until all remaining summands -// are terminal, see MemPointerDecomposedFormParser::parse_sub_expression. This effectively parses -// the pointer expression recursively. +// Hence, in MemPointerParser::parse_decomposed_form, we start with the pointer as a trivial summand. +// A summand can either be decomposed further or it is terminal (cannot be decomposed further). +// We decompose the summands recursively until all remaining summands are terminal, see +// MemPointerParser::parse_sub_expression. This effectively parses the pointer expression recursively. +// +// MemPointerAliasing: +// The decomposed form allows us to determine the aliasing between two pointers easily. For +// example, if two pointers are identical, except for their constant: +// +// pointer1 = SUM(summands) + con1 +// pointer2 = SUM(summands) + con2 +// +// then we can easily compute the distance between the pointers (distance = con2 - con1), +// and determine if they are adjacent. // // ----------------------------------------------------------------------------------------- // @@ -269,12 +267,11 @@ // mp1 and mp2: // p1 - p2 = mp1 - mp2 // -// Note: MemPointerDecomposedForm::get_aliasing_with relies on this MemPointer Lemma to -// prove the correctness of its aliasing computation between two MemPointers. +// Note: MemPointer::get_aliasing_with relies on this MemPointer Lemma to prove the correctness of its +// aliasing computation between two MemPointers. // // -// Note: MemPointerDecomposedFormParser::is_safe_to_decompose_op checks that all -// decompositions we apply are safe. +// Note: MemPointerParser::is_safe_to_decompose_op checks that all decompositions we apply are safe. 
// // // Proof of the "MemPointer Lemma": @@ -445,7 +442,7 @@ class MemPointerAliasing { #endif }; -// Summand of a MemPointerDecomposedForm: +// Summand of a MemPointer: // // summand = scale * variable // @@ -519,7 +516,7 @@ class MemPointerSummand : public StackObj { // Node: if the base is known, then it is in the 0th summand. A base can be: // - on-heap / object: base().object() // - off-heap / native: base().native() -class MemPointerDecomposedForm : public StackObj { +class MemPointer : public StackObj { public: // We limit the number of summands to 10. This is just a best guess, and not at this // point supported by evidence. But I think it is reasonable: usually, a pointer @@ -578,9 +575,9 @@ class MemPointerDecomposedForm : public StackObj { const jint _size; // Default / trivial: pointer = 0 + 1 * pointer - MemPointerDecomposedForm(Node* pointer, - const jint size - NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + MemPointer(Node* pointer, + const jint size + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : NOT_PRODUCT(_trace(trace) COMMA) _con(NoOverflowInt(0)), _base(Base()), @@ -592,11 +589,11 @@ class MemPointerDecomposedForm : public StackObj { } // pointer = SUM(SUMMANDS) + con - MemPointerDecomposedForm(Node* pointer, - const GrowableArray& summands, - const NoOverflowInt& con, - const jint size - NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + MemPointer(Node* pointer, + const GrowableArray& summands, + const NoOverflowInt& con, + const jint size + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : NOT_PRODUCT(_trace(trace) COMMA) _con(con), _base(Base::make(pointer, summands)), @@ -632,34 +629,34 @@ class MemPointerDecomposedForm : public StackObj { } public: - static MemPointerDecomposedForm make_trivial(Node* pointer, - const jint size - NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { - return MemPointerDecomposedForm(pointer, size NOT_PRODUCT(COMMA trace)); + static MemPointer make_trivial(Node* pointer, + const jint size + 
NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { + return MemPointer(pointer, size NOT_PRODUCT(COMMA trace)); } - static MemPointerDecomposedForm make(Node* pointer, - const GrowableArray& summands, - const NoOverflowInt& con, - const jint size - NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { + static MemPointer make(Node* pointer, + const GrowableArray& summands, + const NoOverflowInt& con, + const jint size + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { if (summands.length() <= SUMMANDS_SIZE) { - return MemPointerDecomposedForm(pointer, summands, con, size NOT_PRODUCT(COMMA trace)); + return MemPointer(pointer, summands, con, size NOT_PRODUCT(COMMA trace)); } else { - return MemPointerDecomposedForm::make_trivial(pointer, size NOT_PRODUCT(COMMA trace)); + return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA trace)); } } private: - MemPointerAliasing get_aliasing_with(const MemPointerDecomposedForm& other + MemPointerAliasing get_aliasing_with(const MemPointer& other NOT_PRODUCT(COMMA const TraceMemPointer& trace)) const; - bool has_same_summands_as(const MemPointerDecomposedForm& other, uint start) const; - bool has_same_summands_as(const MemPointerDecomposedForm& other) const { return has_same_summands_as(other, 0); } - bool has_different_base_but_otherwise_same_summands_as(const MemPointerDecomposedForm& other) const; + bool has_same_summands_as(const MemPointer& other, uint start) const; + bool has_same_summands_as(const MemPointer& other) const { return has_same_summands_as(other, 0); } + bool has_different_base_but_otherwise_same_summands_as(const MemPointer& other) const; public: - bool has_same_non_base_summands_as(const MemPointerDecomposedForm& other) const { + bool has_same_non_base_summands_as(const MemPointer& other) const { if (!base().is_known() || !other.base().is_known()) { assert(false, "unknonw base case is not answered optimally"); return false; @@ -677,7 +674,7 @@ class MemPointerDecomposedForm : public StackObj { const 
Base& base() const { return _base; } jint size() const { return _size; } - static int cmp_summands(const MemPointerDecomposedForm& a, const MemPointerDecomposedForm& b) { + static int cmp_summands(const MemPointer& a, const MemPointer& b) { for (int i = 0; i < SUMMANDS_SIZE; i++) { const MemPointerSummand& s_a = a.summands_at(i); const MemPointerSummand& s_b = b.summands_at(i); @@ -697,8 +694,8 @@ class MemPointerDecomposedForm : public StackObj { } } - bool is_adjacent_to_and_before(const MemPointerDecomposedForm& other) const; - bool never_overlaps_with(const MemPointerDecomposedForm& other) const; + bool is_adjacent_to_and_before(const MemPointer& other) const; + bool never_overlaps_with(const MemPointer& other) const; #ifndef PRODUCT void print_form_on(outputStream* st) const { @@ -717,7 +714,7 @@ class MemPointerDecomposedForm : public StackObj { } void print_on(outputStream* st, bool end_with_cr = true) const { - st->print("MemPointerDecomposedForm[base: "); + st->print("MemPointer[base: "); _base.print_on(st); st->print(", form: "); print_form_on(st); @@ -727,7 +724,7 @@ class MemPointerDecomposedForm : public StackObj { #endif }; -class MemPointerDecomposedFormParser : public StackObj { +class MemPointerParser : public StackObj { public: class Callback : public StackObj { public: @@ -747,11 +744,11 @@ class MemPointerDecomposedFormParser : public StackObj { GrowableArray _summands; // Resulting decomposed-form. - MemPointerDecomposedForm _decomposed_form; + MemPointer _decomposed_form; public: // No callback. - MemPointerDecomposedFormParser(const MemNode* mem + MemPointerParser(const MemNode* mem NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), @@ -759,18 +756,18 @@ class MemPointerDecomposedFormParser : public StackObj { _decomposed_form(parse_decomposed_form(_empty_callback)) {} // With callback. 
- MemPointerDecomposedFormParser(const MemNode* mem, - Callback& adr_node_callback - NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + MemPointerParser(const MemNode* mem, + Callback& adr_node_callback + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), _con(NoOverflowInt(0)), _decomposed_form(parse_decomposed_form(adr_node_callback)) {} - const MemPointerDecomposedForm& decomposed_form() const { return _decomposed_form; } + const MemPointer& decomposed_form() const { return _decomposed_form; } private: - MemPointerDecomposedForm parse_decomposed_form(Callback& adr_node_callback); + MemPointer parse_decomposed_form(Callback& adr_node_callback); void parse_sub_expression(const MemPointerSummand& summand, Callback& adr_node_callback); @@ -780,15 +777,15 @@ class MemPointerDecomposedFormParser : public StackObj { // TODO maybe merge with decomposed form? // Facility to parse the pointer of a Load or Store, so that aliasing between two such // memory operations can be determined (e.g. adjacency). 
-class MemPointer : public StackObj { +class MemPointerX : public StackObj { private: NOT_PRODUCT( const TraceMemPointer& _trace; ) const MemNode* _mem; - const MemPointerDecomposedForm _decomposed_form; + const MemPointer _decomposed_form; public: - MemPointer(const MemNode* mem NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + MemPointerX(const MemNode* mem NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), _decomposed_form(init_decomposed_form()) @@ -803,17 +800,17 @@ class MemPointer : public StackObj { #endif } - const MemNode* mem() const { return _mem; } - const MemPointerDecomposedForm decomposed_form() const { return _decomposed_form; } - bool is_adjacent_to_and_before(const MemPointer& other) const { + bool is_adjacent_to_and_before(const MemPointerX& other) const { return decomposed_form().is_adjacent_to_and_before(other.decomposed_form()); } private: - const MemPointerDecomposedForm init_decomposed_form() { + const MemPointer decomposed_form() const { return _decomposed_form; } + + const MemPointer init_decomposed_form() { assert(_mem->is_Store(), "only stores are supported"); ResourceMark rm; - MemPointerDecomposedFormParser parser(_mem NOT_PRODUCT(COMMA _trace)); + MemPointerParser parser(_mem NOT_PRODUCT(COMMA _trace)); return parser.decomposed_form(); } }; diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index a7f1cdeb6a6f5..61703fc2a8152 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -50,7 +50,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : } // Collect ignored loop nodes during VPointer parsing. 
-class SuperWordUnrollingAnalysisIgnoredNodes : public MemPointerDecomposedFormParser::Callback { +class SuperWordUnrollingAnalysisIgnoredNodes : public MemPointerParser::Callback { private: const VLoop& _vloop; const Node_List& _body; @@ -513,8 +513,8 @@ int SuperWord::MemOp::cmp_by_group(MemOp* a, MemOp* b) { RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->Opcode(), b->mem()->Opcode()); // VPointer summands - return MemPointerDecomposedForm::cmp_summands(a->vpointer().decomposed_form(), - b->vpointer().decomposed_form()); + return MemPointer::cmp_summands(a->vpointer().decomposed_form(), + b->vpointer().decomposed_form()); } int SuperWord::MemOp::cmp_by_group_and_con(MemOp* a, MemOp* b) { diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 5dd495edc9a50..358c7f05b83cc 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -190,7 +190,7 @@ void VLoopVPointers::compute_and_cache_vpointers() { int pointers_idx = 0; _body.for_each_mem([&] (MemNode* const mem, int bb_idx) { // Placement new: construct directly into the array. - MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? + MemPointerParser::Callback empty_callback; // TODO rm? 
::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, empty_callback); _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); pointers_idx++; diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 2d2136551b39d..76acca11db395 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -677,7 +677,7 @@ class VLoopAnalyzer : StackObj { VStatus setup_submodules_helper(); }; -// VPointer adapts the MemPointerDecomposedForm to the use in a loop: +// VPointer adapts the MemPointer to the use in a loop: // // pointer = SUM(summands) + con // @@ -702,10 +702,10 @@ class VLoopAnalyzer : StackObj { // class VPointer : public ArenaObj { private: - typedef MemPointerDecomposedFormParser::Callback Callback; + typedef MemPointerParser::Callback Callback; const VLoop& _vloop; - const MemPointerDecomposedForm _decomposed_form; + const MemPointer _decomposed_form; const jint _size; // Derived, for quicker use. @@ -734,7 +734,7 @@ class VPointer : public ArenaObj { // Accessors bool is_valid() const { return _is_valid; } - const MemPointerDecomposedForm& decomposed_form() const { assert(_is_valid, ""); return _decomposed_form; } + const MemPointer& decomposed_form() const { assert(_is_valid, ""); return _decomposed_form; } jint size() const { assert(_is_valid, ""); return _size; } jint iv_scale() const { assert(_is_valid, ""); return _iv_scale; } jint con() const { return decomposed_form().con().value(); } @@ -797,7 +797,7 @@ class VPointer : public ArenaObj { } bool overlap_possible_with_any_in(const GrowableArray& nodes) const { - MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? + MemPointerParser::Callback empty_callback; // TODO rm? 
for (int i = 0; i < nodes.length(); i++) { MemNode* mem = nodes.at(i)->as_Mem(); VPointer mem_p(mem->as_Mem(), _vloop, empty_callback); @@ -811,19 +811,19 @@ class VPointer : public ArenaObj { NOT_PRODUCT( void print_on(outputStream* st) const; ) private: - static const MemPointerDecomposedForm init_decomposed_form(const MemNode* mem, - Callback& adr_node_callback, - const VLoop& vloop) { + static const MemPointer init_decomposed_form(const MemNode* mem, + Callback& adr_node_callback, + const VLoop& vloop) { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are supported"); ResourceMark rm; - MemPointerDecomposedFormParser parser(mem, + MemPointerParser parser(mem, adr_node_callback NOT_PRODUCT(COMMA vloop.mptrace())); return parser.decomposed_form(); } jint init_iv_scale() const { - for (uint i = 0; i < MemPointerDecomposedForm::SUMMANDS_SIZE; i++) { + for (uint i = 0; i < MemPointer::SUMMANDS_SIZE; i++) { const MemPointerSummand& summand = _decomposed_form.summands_at(i); Node* variable = summand.variable(); if (variable == _vloop.iv()) { @@ -844,7 +844,7 @@ class VPointer : public ArenaObj { return false; } - for (uint i = 0; i < MemPointerDecomposedForm::SUMMANDS_SIZE; i++) { + for (uint i = 0; i < MemPointer::SUMMANDS_SIZE; i++) { const MemPointerSummand& summand = _decomposed_form.summands_at(i); Node* variable = summand.variable(); if (variable != nullptr && variable != _vloop.iv() && !is_invariant(variable, _vloop)) { @@ -1265,7 +1265,7 @@ class AlignmentSolver { AlignmentSolution* solve() const; private: - MemPointerDecomposedForm::Base base() const { return _vpointer.decomposed_form().base();} + MemPointer::Base base() const { return _vpointer.decomposed_form().base();} jint iv_scale() const { return _vpointer.iv_scale(); } class EQ4 { diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 0fe9004b7368c..1f95166b19b4e 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ 
b/src/hotspot/share/opto/vtransform.cpp @@ -581,7 +581,7 @@ VTransformApplyResult VTransformLoadVectorNode::apply(const VLoopAnalyzer& vloop // does not have any memory dependency. while (mem->is_StoreVector()) { // TODO refactor with VPointer for this vector load! - MemPointerDecomposedFormParser::Callback empty_callback; // TODO rm? + MemPointerParser::Callback empty_callback; // TODO rm? VPointer store_p(mem->as_Mem(), vloop_analyzer.vloop(), empty_callback); if (store_p.overlap_possible_with_any_in(nodes())) { break; From 4d6b0c27fb6dd83df2bb5f3eec3de8f3fa34c5d0 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 13:49:18 +0100 Subject: [PATCH 064/130] rename decomposed_form -> mem_pointer --- src/hotspot/share/opto/mempointer.cpp | 2 +- src/hotspot/share/opto/mempointer.hpp | 32 +++++++++---------- src/hotspot/share/opto/superword.cpp | 12 +++---- src/hotspot/share/opto/vectorization.cpp | 4 +-- src/hotspot/share/opto/vectorization.hpp | 40 ++++++++++++------------ src/hotspot/share/opto/vtransform.cpp | 4 +-- 6 files changed, 47 insertions(+), 47 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 357deeaabfb66..9453af98097d4 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -29,7 +29,7 @@ // Recursively parse the pointer expression with a DFS all-path traversal // (i.e. with node repetitions), starting at the pointer. 
-MemPointer MemPointerParser::parse_decomposed_form(Callback& adr_node_callback) { +MemPointer MemPointerParser::parse(Callback& adr_node_callback) { assert(_worklist.is_empty(), "no prior parsing"); assert(_summands.is_empty(), "no prior parsing"); diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 59600999456d1..b812574c5d252 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -193,10 +193,10 @@ // // This allows us to easily see that these two pointers are adjacent (distance = 4). // -// Hence, in MemPointerParser::parse_decomposed_form, we start with the pointer as a trivial summand. -// A summand can either be decomposed further or it is terminal (cannot be decomposed further). -// We decompose the summands recursively until all remaining summands are terminal, see -// MemPointerParser::parse_sub_expression. This effectively parses the pointer expression recursively. +// Hence, in MemPointerParser::parse, we start with the pointer as a trivial summand. A summand can either +// be decomposed further or it is terminal (cannot be decomposed further). We decompose the summands +// recursively until all remaining summands are terminal, see MemPointerParser::parse_sub_expression. +// This effectively parses the pointer expression recursively. // // MemPointerAliasing: // The decomposed form allows us to determine the aliasing between two pointers easily. For @@ -744,7 +744,7 @@ class MemPointerParser : public StackObj { GrowableArray _summands; // Resulting decomposed-form. - MemPointer _decomposed_form; + MemPointer _mem_pointer; public: // No callback. @@ -753,7 +753,7 @@ class MemPointerParser : public StackObj { NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), _con(NoOverflowInt(0)), - _decomposed_form(parse_decomposed_form(_empty_callback)) {} + _mem_pointer(parse(_empty_callback)) {} // With callback. 
MemPointerParser(const MemNode* mem, @@ -762,12 +762,12 @@ class MemPointerParser : public StackObj { NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), _con(NoOverflowInt(0)), - _decomposed_form(parse_decomposed_form(adr_node_callback)) {} + _mem_pointer(parse(adr_node_callback)) {} - const MemPointer& decomposed_form() const { return _decomposed_form; } + const MemPointer& mem_pointer() const { return _mem_pointer; } private: - MemPointer parse_decomposed_form(Callback& adr_node_callback); + MemPointer parse(Callback& adr_node_callback); void parse_sub_expression(const MemPointerSummand& summand, Callback& adr_node_callback); @@ -782,36 +782,36 @@ class MemPointerX : public StackObj { NOT_PRODUCT( const TraceMemPointer& _trace; ) const MemNode* _mem; - const MemPointer _decomposed_form; + const MemPointer _mem_pointer; public: MemPointerX(const MemNode* mem NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), - _decomposed_form(init_decomposed_form()) + _mem_pointer(init_mem_pointer()) { #ifndef PRODUCT if (_trace.is_trace_pointer()) { tty->print_cr("MemPointer::MemPointer:"); tty->print("mem: "); mem->dump(); _mem->in(MemNode::Address)->dump_bfs(5, 0, "d"); - _decomposed_form.print_on(tty); + _mem_pointer.print_on(tty); } #endif } bool is_adjacent_to_and_before(const MemPointerX& other) const { - return decomposed_form().is_adjacent_to_and_before(other.decomposed_form()); + return mem_pointer().is_adjacent_to_and_before(other.mem_pointer()); } private: - const MemPointer decomposed_form() const { return _decomposed_form; } + const MemPointer mem_pointer() const { return _mem_pointer; } - const MemPointer init_decomposed_form() { + const MemPointer init_mem_pointer() { assert(_mem->is_Store(), "only stores are supported"); ResourceMark rm; MemPointerParser parser(_mem NOT_PRODUCT(COMMA _trace)); - return parser.decomposed_form(); + return parser.mem_pointer(); } }; diff --git a/src/hotspot/share/opto/superword.cpp 
b/src/hotspot/share/opto/superword.cpp index 61703fc2a8152..fb35ba415e1e3 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -513,8 +513,8 @@ int SuperWord::MemOp::cmp_by_group(MemOp* a, MemOp* b) { RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->Opcode(), b->mem()->Opcode()); // VPointer summands - return MemPointer::cmp_summands(a->vpointer().decomposed_form(), - b->vpointer().decomposed_form()); + return MemPointer::cmp_summands(a->vpointer().mem_pointer(), + b->vpointer().mem_pointer()); } int SuperWord::MemOp::cmp_by_group_and_con(MemOp* a, MemOp* b) { @@ -523,8 +523,8 @@ int SuperWord::MemOp::cmp_by_group_and_con(MemOp* a, MemOp* b) { if (cmp_group != 0) { return cmp_group; } // VPointer con - jint a_con = a->vpointer().decomposed_form().con().value(); - jint b_con = b->vpointer().decomposed_form().con().value(); + jint a_con = a->vpointer().mem_pointer().con().value(); + jint b_con = b->vpointer().mem_pointer().con().value(); RETURN_CMP_VALUE_IF_NOT_EQUAL(a_con, b_con); return 0; @@ -2840,8 +2840,8 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { const int iv_stride = this->iv_stride(); const int iv_scale = p.iv_scale(); const int con = p.con(); - Node* base = p.decomposed_form().base().object_or_native(); - bool is_base_native = p.decomposed_form().base().is_native(); + Node* base = p.mem_pointer().base().object_or_native(); + bool is_base_native = p.mem_pointer().base().is_native(); #ifdef ASSERT if (_trace._align_vector) { diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 358c7f05b83cc..2888a032a8858 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -403,9 +403,9 @@ void VPointer::print_on(outputStream* st) const { } st->print("size: %2d, base: ", _size); - _decomposed_form.base().print_on(st); + _mem_pointer.base().print_on(st); st->print(", form: "); - _decomposed_form.print_form_on(st); 
+ _mem_pointer.print_form_on(st); st->print(", invar_summands: "); for_each_invar_summand([&] (const MemPointerSummand& s) { s.print_on(tty); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 76acca11db395..5486fd46621b5 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -705,7 +705,7 @@ class VPointer : public ArenaObj { typedef MemPointerParser::Callback Callback; const VLoop& _vloop; - const MemPointer _decomposed_form; + const MemPointer _mem_pointer; const jint _size; // Derived, for quicker use. @@ -717,7 +717,7 @@ class VPointer : public ArenaObj { template VPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : _vloop(vloop), - _decomposed_form(init_decomposed_form(mem, adr_node_callback, vloop)), + _mem_pointer(init_mem_pointer(mem, adr_node_callback, vloop)), _size(mem->memory_size()), _iv_scale(init_iv_scale()), _is_valid(init_is_valid()) @@ -733,17 +733,17 @@ class VPointer : public ArenaObj { } // Accessors - bool is_valid() const { return _is_valid; } - const MemPointer& decomposed_form() const { assert(_is_valid, ""); return _decomposed_form; } - jint size() const { assert(_is_valid, ""); return _size; } - jint iv_scale() const { assert(_is_valid, ""); return _iv_scale; } - jint con() const { return decomposed_form().con().value(); } + bool is_valid() const { return _is_valid; } + const MemPointer& mem_pointer() const { assert(_is_valid, ""); return _mem_pointer; } + jint size() const { assert(_is_valid, ""); return _size; } + jint iv_scale() const { assert(_is_valid, ""); return _iv_scale; } + jint con() const { return mem_pointer().con().value(); } template void for_each_invar_summand(Callback callback) const { - decomposed_form().for_each_non_empty_summand([&] (const MemPointerSummand& s) { + mem_pointer().for_each_non_empty_summand([&] (const MemPointerSummand& s) { Node* variable = s.variable(); - if (variable != 
decomposed_form().base().object_or_native() && + if (variable != mem_pointer().base().object_or_native() && is_invariant(variable, _vloop)) { callback(s); } @@ -777,11 +777,11 @@ class VPointer : public ArenaObj { bool has_same_invar_and_iv_scale_as(const VPointer& other) const { // If we have the same invar_summands, and the same iv summand with the same iv_scale, // then all summands except the base must be the same. - return decomposed_form().has_same_non_base_summands_as(other.decomposed_form()); + return mem_pointer().has_same_non_base_summands_as(other.mem_pointer()); } bool is_adjacent_to_and_before(const VPointer& other) const { - return decomposed_form().is_adjacent_to_and_before(other.decomposed_form()); + return mem_pointer().is_adjacent_to_and_before(other.mem_pointer()); } bool never_overlaps_with(const VPointer& other) const { @@ -793,7 +793,7 @@ class VPointer : public ArenaObj { #endif return false; } - return decomposed_form().never_overlaps_with(other.decomposed_form()); + return mem_pointer().never_overlaps_with(other.mem_pointer()); } bool overlap_possible_with_any_in(const GrowableArray& nodes) const { @@ -811,20 +811,20 @@ class VPointer : public ArenaObj { NOT_PRODUCT( void print_on(outputStream* st) const; ) private: - static const MemPointer init_decomposed_form(const MemNode* mem, - Callback& adr_node_callback, - const VLoop& vloop) { + static const MemPointer init_mem_pointer(const MemNode* mem, + Callback& adr_node_callback, + const VLoop& vloop) { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are supported"); ResourceMark rm; MemPointerParser parser(mem, adr_node_callback NOT_PRODUCT(COMMA vloop.mptrace())); - return parser.decomposed_form(); + return parser.mem_pointer(); } jint init_iv_scale() const { for (uint i = 0; i < MemPointer::SUMMANDS_SIZE; i++) { - const MemPointerSummand& summand = _decomposed_form.summands_at(i); + const MemPointerSummand& summand = _mem_pointer.summands_at(i); Node* variable = 
summand.variable(); if (variable == _vloop.iv()) { return summand.scale().value(); @@ -837,7 +837,7 @@ class VPointer : public ArenaObj { // Check that all variables are either the iv, or else invariants. // TODO why pre-loop bool init_is_valid() const { - if (!_decomposed_form.base().is_known()) { + if (!_mem_pointer.base().is_known()) { // VPointer needs to know if it is native (off-heap) or object (on-heap). // We may for example have failed to fully decompose the MemPointer, possibly // because such a decomposition is not considered safe. @@ -845,7 +845,7 @@ class VPointer : public ArenaObj { } for (uint i = 0; i < MemPointer::SUMMANDS_SIZE; i++) { - const MemPointerSummand& summand = _decomposed_form.summands_at(i); + const MemPointerSummand& summand = _mem_pointer.summands_at(i); Node* variable = summand.variable(); if (variable != nullptr && variable != _vloop.iv() && !is_invariant(variable, _vloop)) { return false; @@ -1265,7 +1265,7 @@ class AlignmentSolver { AlignmentSolution* solve() const; private: - MemPointer::Base base() const { return _vpointer.decomposed_form().base();} + MemPointer::Base base() const { return _vpointer.mem_pointer().base();} jint iv_scale() const { return _vpointer.iv_scale(); } class EQ4 { diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 1f95166b19b4e..95432f87ab2ed 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -170,7 +170,7 @@ class VMemoryRegion : public StackObj { VMemoryRegion() : _vpointer(nullptr) {} // empty constructor for GrowableArray VMemoryRegion(const VPointer& vpointer, int iv_offset, int vector_length, bool is_load, uint schedule_order) : _vpointer(&vpointer), - _base( vpointer.decomposed_form().base().object_or_native()), + _base( vpointer.mem_pointer().base().object_or_native()), _scale( vpointer.iv_scale()), _invar( nullptr), // TODO _offset( vpointer.con() + _scale * iv_offset), @@ -225,7 +225,7 @@ class 
VMemoryRegion : public StackObj { void print() const { tty->print("VMemoryRegion[%s %dbytes, schedule_order(%4d), ", _is_load ? "load " : "store", _memory_size, _schedule_order); - _vpointer->decomposed_form().print_on(tty, false); + _vpointer->mem_pointer().print_on(tty, false); tty->print_cr("]"); } #endif From ad82ff31332ea8a641704fd93b1b3c3694e90917 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 14:43:30 +0100 Subject: [PATCH 065/130] move parsing --- src/hotspot/share/opto/mempointer.hpp | 13 +++++++++++-- src/hotspot/share/opto/vectorization.hpp | 15 +++------------ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index b812574c5d252..449e341f2b1aa 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -348,7 +348,6 @@ // This shows that p1 and p2 have a distance greater than the array size, and hence at least one of the two // pointers must be out of bounds. This contradicts our assumption (S1) and we are done. - #ifndef PRODUCT class TraceMemPointer : public StackObj { private: @@ -749,7 +748,7 @@ class MemPointerParser : public StackObj { public: // No callback. MemPointerParser(const MemNode* mem - NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), _con(NoOverflowInt(0)), @@ -764,6 +763,16 @@ class MemPointerParser : public StackObj { _con(NoOverflowInt(0)), _mem_pointer(parse(adr_node_callback)) {} + static MemPointer parse(const MemNode* mem, + Callback& adr_node_callback + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { + assert(mem->is_Store() || mem->is_Load(), "only stores and loads are allowed"); + ResourceMark rm; + MemPointerParser parser(mem NOT_PRODUCT(COMMA trace)); + return parser.mem_pointer(); + } + + // TODO rm / private? 
const MemPointer& mem_pointer() const { return _mem_pointer; } private: diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 5486fd46621b5..1a090bd48e546 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -717,7 +717,9 @@ class VPointer : public ArenaObj { template VPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : _vloop(vloop), - _mem_pointer(init_mem_pointer(mem, adr_node_callback, vloop)), + _mem_pointer(MemPointerParser::parse(mem, + adr_node_callback + NOT_PRODUCT(COMMA vloop.mptrace()))), _size(mem->memory_size()), _iv_scale(init_iv_scale()), _is_valid(init_is_valid()) @@ -811,17 +813,6 @@ class VPointer : public ArenaObj { NOT_PRODUCT( void print_on(outputStream* st) const; ) private: - static const MemPointer init_mem_pointer(const MemNode* mem, - Callback& adr_node_callback, - const VLoop& vloop) { - assert(mem->is_Store() || mem->is_Load(), "only stores and loads are supported"); - ResourceMark rm; - MemPointerParser parser(mem, - adr_node_callback - NOT_PRODUCT(COMMA vloop.mptrace())); - return parser.mem_pointer(); - } - jint init_iv_scale() const { for (uint i = 0; i < MemPointer::SUMMANDS_SIZE; i++) { const MemPointerSummand& summand = _mem_pointer.summands_at(i); From 1c861a746b4ce173542e75a79b1abe2e209a1940 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 14:55:39 +0100 Subject: [PATCH 066/130] rm old MemPointer --- src/hotspot/share/opto/memnode.cpp | 5 +-- src/hotspot/share/opto/mempointer.hpp | 45 ++------------------------- 2 files changed, 5 insertions(+), 45 deletions(-) diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index 4b45e95c1dae1..b1be00adb4a6e 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -2947,8 +2947,9 @@ bool MergePrimitiveStores::is_adjacent_pair(const StoreNode* use_store, const St 
is_trace_adjacency(), true); #endif - const MemPointerX pointer_use(use_store NOT_PRODUCT( COMMA trace )); - const MemPointerX pointer_def(def_store NOT_PRODUCT( COMMA trace )); + MemPointerParser::Callback empty_callback; // TODO rm? + const MemPointer pointer_use(MemPointerParser::parse(use_store, empty_callback NOT_PRODUCT( COMMA trace ))); + const MemPointer pointer_def(MemPointerParser::parse(def_store, empty_callback NOT_PRODUCT( COMMA trace ))); return pointer_def.is_adjacent_to_and_before(pointer_use); } diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 449e341f2b1aa..e0d877b98bfa5 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -351,8 +351,8 @@ #ifndef PRODUCT class TraceMemPointer : public StackObj { private: - // TODO rename and possibly extend, also rename tags - const bool _is_trace_pointer; + // TODO rename and possibly extend, also rename tags, check where apply + const bool _is_trace_pointer; // TODO parse pointer? and check where to add! const bool _is_trace_aliasing; const bool _is_trace_adjacency; const bool _is_trace_overlap; @@ -783,45 +783,4 @@ class MemPointerParser : public StackObj { bool is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const; }; -// TODO maybe merge with decomposed form? -// Facility to parse the pointer of a Load or Store, so that aliasing between two such -// memory operations can be determined (e.g. adjacency). 
-class MemPointerX : public StackObj { -private: - NOT_PRODUCT( const TraceMemPointer& _trace; ) - - const MemNode* _mem; - const MemPointer _mem_pointer; - -public: - MemPointerX(const MemNode* mem NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : - NOT_PRODUCT(_trace(trace) COMMA) - _mem(mem), - _mem_pointer(init_mem_pointer()) - { -#ifndef PRODUCT - if (_trace.is_trace_pointer()) { - tty->print_cr("MemPointer::MemPointer:"); - tty->print("mem: "); mem->dump(); - _mem->in(MemNode::Address)->dump_bfs(5, 0, "d"); - _mem_pointer.print_on(tty); - } -#endif - } - - bool is_adjacent_to_and_before(const MemPointerX& other) const { - return mem_pointer().is_adjacent_to_and_before(other.mem_pointer()); - } - -private: - const MemPointer mem_pointer() const { return _mem_pointer; } - - const MemPointer init_mem_pointer() { - assert(_mem->is_Store(), "only stores are supported"); - ResourceMark rm; - MemPointerParser parser(_mem NOT_PRODUCT(COMMA _trace)); - return parser.mem_pointer(); - } -}; - #endif // SHARE_OPTO_MEMPOINTER_HPP From 32416577d496ac849cd2bbf450a9016a6a615f8b Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 16:00:39 +0100 Subject: [PATCH 067/130] unify naming of trace --- src/hotspot/share/opto/memnode.cpp | 18 +++++++++--------- src/hotspot/share/opto/mempointer.hpp | 8 ++++---- .../share/opto/traceAutoVectorizationTag.hpp | 8 ++++---- src/hotspot/share/opto/traceMergeStoresTag.hpp | 6 +++--- src/hotspot/share/opto/vectorization.hpp | 10 +++++----- 5 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index b1be00adb4a6e..426257555e012 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -2862,16 +2862,16 @@ class MergePrimitiveStores : public StackObj { return is_trace(TraceMergeStores::Tag::BASIC); } - bool is_trace_pointer() const { - return is_trace(TraceMergeStores::Tag::POINTER); + bool 
is_trace_pointer_parsing() const { + return is_trace(TraceMergeStores::Tag::POINTER_PARSING); } - bool is_trace_aliasing() const { - return is_trace(TraceMergeStores::Tag::ALIASING); + bool is_trace_pointer_aliasing() const { + return is_trace(TraceMergeStores::Tag::POINTER_ALIASING); } - bool is_trace_adjacency() const { - return is_trace(TraceMergeStores::Tag::ADJACENCY); + bool is_trace_pointer_adjacency() const { + return is_trace(TraceMergeStores::Tag::POINTER_ADJACENCY); } bool is_trace_success() const { @@ -2942,9 +2942,9 @@ bool MergePrimitiveStores::is_adjacent_pair(const StoreNode* use_store, const St ResourceMark rm; #ifndef PRODUCT - const TraceMemPointer trace(is_trace_pointer(), - is_trace_aliasing(), - is_trace_adjacency(), + const TraceMemPointer trace(is_trace_pointer_parsing(), + is_trace_pointer_aliasing(), + is_trace_pointer_adjacency(), true); #endif MemPointerParser::Callback empty_callback; // TODO rm? diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index e0d877b98bfa5..20910740a1471 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -352,23 +352,23 @@ class TraceMemPointer : public StackObj { private: // TODO rename and possibly extend, also rename tags, check where apply - const bool _is_trace_pointer; // TODO parse pointer? and check where to add! + const bool _is_trace_parsing; // TODO parse pointer? and check where to add! 
const bool _is_trace_aliasing; const bool _is_trace_adjacency; const bool _is_trace_overlap; public: - TraceMemPointer(const bool is_trace_pointer, + TraceMemPointer(const bool is_trace_parsing, const bool is_trace_aliasing, const bool is_trace_adjacency, const bool is_trace_overlap) : - _is_trace_pointer( is_trace_pointer), + _is_trace_parsing( is_trace_parsing), _is_trace_aliasing( is_trace_aliasing), _is_trace_adjacency(is_trace_adjacency), _is_trace_overlap(is_trace_overlap) {} - bool is_trace_pointer() const { return _is_trace_pointer; } + bool is_trace_parsing() const { return _is_trace_parsing; } bool is_trace_aliasing() const { return _is_trace_aliasing; } bool is_trace_adjacency() const { return _is_trace_adjacency; } bool is_trace_overlap() const { return _is_trace_overlap; } diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index 2c1fe6efb0de1..5d7f0875ef9dd 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -29,10 +29,10 @@ #include "utilities/stringUtils.hpp" #define COMPILER_TRACE_AUTO_VECTORIZATION_TAG(flags) \ - flags(POINTER, "Trace VPointer construction / parsing") \ - flags(ALIASING, "Trace VPointer aliasing") \ - flags(ADJACENCY, "Trace VPointer adjacency") \ - flags(OVERLAP, "Trace VPointer overlap") \ + flags(POINTER_PARSING, "Trace VPointer/MemPointer parsing") \ + flags(POINTER_ALIASING, "Trace VPointer/MemPointer aliasing") \ + flags(POINTER_ADJACENCY, "Trace VPointer/MemPointer adjacency") \ + flags(POINTER_OVERLAP, "Trace VPointer/MemPointer overlap") \ flags(PRECONDITIONS, "Trace VLoop::check_preconditions") \ flags(LOOP_ANALYZER, "Trace VLoopAnalyzer::setup_submodules") \ flags(MEMORY_SLICES, "Trace VLoopMemorySlices") \ diff --git a/src/hotspot/share/opto/traceMergeStoresTag.hpp b/src/hotspot/share/opto/traceMergeStoresTag.hpp index 9f33c9efa0525..68969cd5dc0d0 100644 --- 
a/src/hotspot/share/opto/traceMergeStoresTag.hpp +++ b/src/hotspot/share/opto/traceMergeStoresTag.hpp @@ -31,9 +31,9 @@ namespace TraceMergeStores { #define COMPILER_TAG(flags) \ flags(BASIC, "Trace basic analysis steps") \ - flags(POINTER, "Trace pointer IR") \ - flags(ALIASING, "Trace MemPointerSimpleForm::get_aliasing_with") \ - flags(ADJACENCY, "Trace adjacency") \ + flags(POINTER_PARSING, "Trace pointer IR") \ + flags(POINTER_ALIASING, "Trace MemPointerSimpleForm::get_aliasing_with") \ + flags(POINTER_ADJACENCY, "Trace adjacency") \ flags(SUCCESS, "Trace successful merges") \ #define table_entry(name, description) name, diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 1a090bd48e546..5d8c1b7da1329 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -108,10 +108,10 @@ class VLoop : public StackObj { #ifndef PRODUCT COMMA _mptrace(TraceMemPointer( - _vtrace.is_trace(TraceAutoVectorizationTag::POINTER), - _vtrace.is_trace(TraceAutoVectorizationTag::ALIASING), - _vtrace.is_trace(TraceAutoVectorizationTag::ADJACENCY), - _vtrace.is_trace(TraceAutoVectorizationTag::OVERLAP) + _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_PARSING), + _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_ALIASING), + _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_ADJACENCY), + _vtrace.is_trace(TraceAutoVectorizationTag::POINTER_OVERLAP) )) #endif {} @@ -725,7 +725,7 @@ class VPointer : public ArenaObj { _is_valid(init_is_valid()) { #ifndef PRODUCT - if (vloop.mptrace().is_trace_pointer()) { + if (vloop.mptrace().is_trace_parsing()) { tty->print_cr("VPointer::VPointer:"); tty->print("mem: "); mem->dump(); print_on(tty); From 86724a40cf7aacae206f3c4ed6fca56fd179ca79 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 16:41:37 +0100 Subject: [PATCH 068/130] better parsing trace --- src/hotspot/share/opto/mempointer.hpp | 13 +++++++++++-- 
src/hotspot/share/opto/vectorization.hpp | 1 - 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 20910740a1471..e5b6c7b8a94a4 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -351,8 +351,7 @@ #ifndef PRODUCT class TraceMemPointer : public StackObj { private: - // TODO rename and possibly extend, also rename tags, check where apply - const bool _is_trace_parsing; // TODO parse pointer? and check where to add! + const bool _is_trace_parsing; const bool _is_trace_aliasing; const bool _is_trace_adjacency; const bool _is_trace_overlap; @@ -769,6 +768,16 @@ class MemPointerParser : public StackObj { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are allowed"); ResourceMark rm; MemPointerParser parser(mem NOT_PRODUCT(COMMA trace)); + +#ifndef PRODUCT + if (trace.is_trace_parsing()) { + tty->print_cr("\nMemPointerParser::parse:"); + tty->print(" mem: "); mem->dump(); + parser.mem_pointer().print_on(tty); + mem->in(MemNode::Address)->dump_bfs(7, 0, "d"); + } +#endif + return parser.mem_pointer(); } diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 5d8c1b7da1329..8864f06412b7a 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -729,7 +729,6 @@ class VPointer : public ArenaObj { tty->print_cr("VPointer::VPointer:"); tty->print("mem: "); mem->dump(); print_on(tty); - mem->in(MemNode::Address)->dump_bfs(7, 0, "d"); } #endif } From cc0d79e8690aec75cc11c7d189de8559f129e5ba Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 16:45:44 +0100 Subject: [PATCH 069/130] cleanup --- src/hotspot/share/opto/mempointer.cpp | 1 - src/hotspot/share/opto/mempointer.hpp | 3 +-- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp 
index 9453af98097d4..a03af9558b5a2 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -33,7 +33,6 @@ MemPointer MemPointerParser::parse(Callback& adr_node_callback) { assert(_worklist.is_empty(), "no prior parsing"); assert(_summands.is_empty(), "no prior parsing"); - // TODO maybe refactor out _mem? Node* pointer = _mem->in(MemNode::Address); const jint size = _mem->memory_size(); diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index e5b6c7b8a94a4..fef7997cfcd18 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -781,10 +781,9 @@ class MemPointerParser : public StackObj { return parser.mem_pointer(); } - // TODO rm / private? +private: const MemPointer& mem_pointer() const { return _mem_pointer; } -private: MemPointer parse(Callback& adr_node_callback); void parse_sub_expression(const MemPointerSummand& summand, Callback& adr_node_callback); From 31896bd1eaa7bda109eca1011e17648178dcc490 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 17:14:51 +0100 Subject: [PATCH 070/130] refactor invariant check --- src/hotspot/share/opto/vectorization.hpp | 58 ++++++++++++++++-------- 1 file changed, 38 insertions(+), 20 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 8864f06412b7a..f40a055a462f4 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -185,6 +185,22 @@ class VLoop : public StackObj { return n != nullptr && n->outcnt() > 0 && ctrl == _cl; } + // Some nodes must be pre-loop invariant, so that they can be used for conditions + // before or inside the pre-loop. For example, alignment of main-loop vector + // memops must be acheived in the pre-loop, via the exit check in the pre-loop. 
+ bool is_pre_loop_invariant(Node* n) const { + assert(cl()->is_main_loop(), "must be"); + Node* ctrl = phase()->get_ctrl(n); + + // Quick test: is it in the main-loop? + if (lpt()->is_member(phase()->get_loop(ctrl))) { + return false; + } + + // Is it before the pre-loop? + return phase()->is_dominator(ctrl, pre_loop_head()); + } + // Check if the loop passes some basic preconditions for vectorization. // Return indicates if analysis succeeded. bool check_preconditions(); @@ -745,7 +761,7 @@ class VPointer : public ArenaObj { mem_pointer().for_each_non_empty_summand([&] (const MemPointerSummand& s) { Node* variable = s.variable(); if (variable != mem_pointer().base().object_or_native() && - is_invariant(variable, _vloop)) { + _vloop.is_pre_loop_invariant(variable)) { callback(s); } }); @@ -825,19 +841,33 @@ class VPointer : public ArenaObj { } // Check that all variables are either the iv, or else invariants. - // TODO why pre-loop bool init_is_valid() const { if (!_mem_pointer.base().is_known()) { // VPointer needs to know if it is native (off-heap) or object (on-heap). // We may for example have failed to fully decompose the MemPointer, possibly // because such a decomposition is not considered safe. +#ifndef PRODUCT + if (_vloop.mptrace().is_trace_parsing()) { + tty->print_cr("VPointer::init_is_valid: base not known."); + } +#endif return false; } + // All summands, except the iv-summand must be pre-loop invariant. This is necessary + // so that we can use the variables in checks inside or before the pre-loop, e.g. for + // alignment. 
for (uint i = 0; i < MemPointer::SUMMANDS_SIZE; i++) { const MemPointerSummand& summand = _mem_pointer.summands_at(i); Node* variable = summand.variable(); - if (variable != nullptr && variable != _vloop.iv() && !is_invariant(variable, _vloop)) { + if (variable != nullptr && variable != _vloop.iv() && !_vloop.is_pre_loop_invariant(variable)) { +#ifndef PRODUCT + if (_vloop.mptrace().is_trace_parsing()) { + tty->print("VPointer::init_is_valid: summand is not pre-loop invariant: "); + summand.print_on(tty); + tty->cr(); + } +#endif return false; } } @@ -857,28 +887,16 @@ class VPointer : public ArenaObj { if (abs(long_iv_scale) >= max_val || abs(long_iv_stride) >= max_val || abs(long_iv_scale * long_iv_stride) >= max_val) { +#ifndef PRODUCT + if (_vloop.mptrace().is_trace_parsing()) { + tty->print_cr("VPointer::init_is_valid: scale or stride too large."); + } +#endif return false; } return true; } - - // TODO refactor to VLoop? - // Is it invariant of the loop, i.e. the main-loop and even the pre-loop? - // The invariants are used for alignment, in the exit check of the pre-loop, - // this is why we need invariance of even the pre-loop. - static bool is_invariant(Node* n, const VLoop& vloop) { - assert(vloop.cl()->is_main_loop(), "must be"); - Node* ctrl = vloop.phase()->get_ctrl(n); - - // Quick test: is it in the main-loop? - if (vloop.lpt()->is_member(vloop.phase()->get_loop(ctrl))) { - return false; - } - - // Is it before the pre-loop? 
- return vloop.phase()->is_dominator(ctrl, vloop.pre_loop_head()); - } }; // Vector element size statistics for loop vectorization with vector masks From 4b73d57c7ca992e4b3d4fa454fa06e96720cfe51 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 18:18:52 +0100 Subject: [PATCH 071/130] renaming --- src/hotspot/share/opto/memnode.cpp | 2 +- src/hotspot/share/opto/mempointer.cpp | 14 ++++++------ src/hotspot/share/opto/mempointer.hpp | 29 +++++++++--------------- src/hotspot/share/opto/superword.cpp | 2 +- src/hotspot/share/opto/vectorization.cpp | 2 +- src/hotspot/share/opto/vectorization.hpp | 10 ++++---- src/hotspot/share/opto/vtransform.cpp | 2 +- 7 files changed, 26 insertions(+), 35 deletions(-) diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index 426257555e012..835de6964fac7 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -2947,7 +2947,7 @@ bool MergePrimitiveStores::is_adjacent_pair(const StoreNode* use_store, const St is_trace_pointer_adjacency(), true); #endif - MemPointerParser::Callback empty_callback; // TODO rm? + MemPointerParser::DecomposedNodeCallback empty_callback; // TODO rm? const MemPointer pointer_use(MemPointerParser::parse(use_store, empty_callback NOT_PRODUCT( COMMA trace ))); const MemPointer pointer_def(MemPointerParser::parse(def_store, empty_callback NOT_PRODUCT( COMMA trace ))); return pointer_def.is_adjacent_to_and_before(pointer_use); diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index a03af9558b5a2..2abe5b6dbfe2c 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -29,7 +29,7 @@ // Recursively parse the pointer expression with a DFS all-path traversal // (i.e. with node repetitions), starting at the pointer. 
-MemPointer MemPointerParser::parse(Callback& adr_node_callback) { +MemPointer MemPointerParser::parse(DecomposedNodeCallback& callback) { assert(_worklist.is_empty(), "no prior parsing"); assert(_summands.is_empty(), "no prior parsing"); @@ -47,7 +47,7 @@ MemPointer MemPointerParser::parse(Callback& adr_node_callback) { if (traversal_count++ > 1000) { return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); } - parse_sub_expression(_worklist.pop(), adr_node_callback); + parse_sub_expression(_worklist.pop(), callback); } // Bail out if there is a constant overflow. @@ -88,7 +88,7 @@ MemPointer MemPointerParser::parse(Callback& adr_node_callback) { // Parse a sub-expression of the pointer, starting at the current summand. We parse the // current node, and see if it can be decomposed into further summands, or if the current // summand is terminal. -void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, Callback& adr_node_callback) { +void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, DecomposedNodeCallback& callback) { Node* n = summand.variable(); const NoOverflowInt scale = summand.scale(); const NoOverflowInt one(1); @@ -114,7 +114,7 @@ void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, Ca Node* b = n->in((opc == Op_AddP) ? 
3 : 2); _worklist.push(MemPointerSummand(a, scale)); _worklist.push(MemPointerSummand(b, scale)); - adr_node_callback.callback(n); + callback.callback(n); return; } case Op_SubL: @@ -128,7 +128,7 @@ void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, Ca _worklist.push(MemPointerSummand(a, scale)); _worklist.push(MemPointerSummand(b, sub_scale)); - adr_node_callback.callback(n); + callback.callback(n); return; } case Op_MulL: @@ -163,7 +163,7 @@ void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, Ca NoOverflowInt new_scale = scale * factor; _worklist.push(MemPointerSummand(variable, new_scale)); - adr_node_callback.callback(n); + callback.callback(n); return; } case Op_CastII: @@ -181,7 +181,7 @@ void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, Ca // Decompose: look through. Node* a = n->in(1); _worklist.push(MemPointerSummand(a, scale)); - adr_node_callback.callback(n); + callback.callback(n); return; } case Op_CastX2P: diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index fef7997cfcd18..9bd441a98b98d 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -724,14 +724,16 @@ class MemPointer : public StackObj { class MemPointerParser : public StackObj { public: - class Callback : public StackObj { + // Parsing calls the callback on every decomposed node. These are all the + // nodes on the paths from the pointer to the summand variables, i.e. the + // "inner" nodes of the pointer expression. This callback allows collecting + // all such nodes of a pointer expression. 
+ class DecomposedNodeCallback : public StackObj { public: virtual void callback(Node* n) { /* do nothing by default */ } }; private: - Callback _empty_callback; - NOT_PRODUCT( const TraceMemPointer& _trace; ) const MemNode* _mem; @@ -745,29 +747,20 @@ class MemPointerParser : public StackObj { MemPointer _mem_pointer; public: - // No callback. - MemPointerParser(const MemNode* mem - NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : - NOT_PRODUCT(_trace(trace) COMMA) - _mem(mem), - _con(NoOverflowInt(0)), - _mem_pointer(parse(_empty_callback)) {} - - // With callback. MemPointerParser(const MemNode* mem, - Callback& adr_node_callback + DecomposedNodeCallback& callback NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), _con(NoOverflowInt(0)), - _mem_pointer(parse(adr_node_callback)) {} + _mem_pointer(parse(callback)) {} static MemPointer parse(const MemNode* mem, - Callback& adr_node_callback + DecomposedNodeCallback& callback NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are allowed"); ResourceMark rm; - MemPointerParser parser(mem NOT_PRODUCT(COMMA trace)); + MemPointerParser parser(mem, callback NOT_PRODUCT(COMMA trace)); #ifndef PRODUCT if (trace.is_trace_parsing()) { @@ -784,9 +777,9 @@ class MemPointerParser : public StackObj { private: const MemPointer& mem_pointer() const { return _mem_pointer; } - MemPointer parse(Callback& adr_node_callback); + MemPointer parse(DecomposedNodeCallback& callback); - void parse_sub_expression(const MemPointerSummand& summand, Callback& adr_node_callback); + void parse_sub_expression(const MemPointerSummand& summand, DecomposedNodeCallback& callback); bool is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const; }; diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index fb35ba415e1e3..93c0225aa5e66 100644 --- a/src/hotspot/share/opto/superword.cpp +++ 
b/src/hotspot/share/opto/superword.cpp @@ -50,7 +50,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : } // Collect ignored loop nodes during VPointer parsing. -class SuperWordUnrollingAnalysisIgnoredNodes : public MemPointerParser::Callback { +class SuperWordUnrollingAnalysisIgnoredNodes : public MemPointerParser::DecomposedNodeCallback { private: const VLoop& _vloop; const Node_List& _body; diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 2888a032a8858..1243e84b2b776 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -190,7 +190,7 @@ void VLoopVPointers::compute_and_cache_vpointers() { int pointers_idx = 0; _body.for_each_mem([&] (MemNode* const mem, int bb_idx) { // Placement new: construct directly into the array. - MemPointerParser::Callback empty_callback; // TODO rm? + MemPointerParser::DecomposedNodeCallback empty_callback; // TODO rm? ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, empty_callback); _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); pointers_idx++; diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index f40a055a462f4..224bf75e4edb3 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -718,7 +718,7 @@ class VLoopAnalyzer : StackObj { // class VPointer : public ArenaObj { private: - typedef MemPointerParser::Callback Callback; + typedef MemPointerParser::DecomposedNodeCallback DecomposedNodeCallback; const VLoop& _vloop; const MemPointer _mem_pointer; @@ -730,11 +730,10 @@ class VPointer : public ArenaObj { const bool _is_valid; public: - template - VPointer(const MemNode* mem, const VLoop& vloop, Callback& adr_node_callback) : + VPointer(const MemNode* mem, const VLoop& vloop, DecomposedNodeCallback& callback) : _vloop(vloop), _mem_pointer(MemPointerParser::parse(mem, - adr_node_callback + callback NOT_PRODUCT(COMMA 
vloop.mptrace()))), _size(mem->memory_size()), _iv_scale(init_iv_scale()), @@ -814,7 +813,7 @@ class VPointer : public ArenaObj { } bool overlap_possible_with_any_in(const GrowableArray& nodes) const { - MemPointerParser::Callback empty_callback; // TODO rm? + MemPointerParser::DecomposedNodeCallback empty_callback; // TODO rm? for (int i = 0; i < nodes.length(); i++) { MemNode* mem = nodes.at(i)->as_Mem(); VPointer mem_p(mem->as_Mem(), _vloop, empty_callback); @@ -1210,7 +1209,6 @@ class AlignmentSolver { private: const VPointer& _vpointer; - // TODO rm? const MemNode* _mem_ref; // first element const int _vector_width; // in bytes diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 95432f87ab2ed..c83757c49968a 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -581,7 +581,7 @@ VTransformApplyResult VTransformLoadVectorNode::apply(const VLoopAnalyzer& vloop // does not have any memory dependency. while (mem->is_StoreVector()) { // TODO refactor with VPointer for this vector load! - MemPointerParser::Callback empty_callback; // TODO rm? + MemPointerParser::DecomposedNodeCallback empty_callback; // TODO rm? 
VPointer store_p(mem->as_Mem(), vloop_analyzer.vloop(), empty_callback); if (store_p.overlap_possible_with_any_in(nodes())) { break; From 6e1aa80a94de642383ddc8257682ee538f519afd Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 18:39:14 +0100 Subject: [PATCH 072/130] empty callback --- src/hotspot/share/opto/memnode.cpp | 5 ++--- src/hotspot/share/opto/mempointer.cpp | 2 ++ src/hotspot/share/opto/mempointer.hpp | 12 +++++++++--- src/hotspot/share/opto/vectorization.hpp | 6 +++--- 4 files changed, 16 insertions(+), 9 deletions(-) diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index 835de6964fac7..ca64881b25c7e 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -2947,9 +2947,8 @@ bool MergePrimitiveStores::is_adjacent_pair(const StoreNode* use_store, const St is_trace_pointer_adjacency(), true); #endif - MemPointerParser::DecomposedNodeCallback empty_callback; // TODO rm? - const MemPointer pointer_use(MemPointerParser::parse(use_store, empty_callback NOT_PRODUCT( COMMA trace ))); - const MemPointer pointer_def(MemPointerParser::parse(def_store, empty_callback NOT_PRODUCT( COMMA trace ))); + const MemPointer pointer_use(MemPointerParser::parse(NOT_PRODUCT(trace COMMA) use_store)); + const MemPointer pointer_def(MemPointerParser::parse(NOT_PRODUCT(trace COMMA) def_store)); return pointer_def.is_adjacent_to_and_before(pointer_use); } diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 2abe5b6dbfe2c..811bf49440880 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -27,6 +27,8 @@ #include "opto/addnode.hpp" #include "utilities/resourceHash.hpp" +NOT_PRODUCT(MemPointerParser::DecomposedNodeCallback MemPointerParser::DecomposedNodeCallback::_empty;) + // Recursively parse the pointer expression with a DFS all-path traversal // (i.e. with node repetitions), starting at the pointer. 
MemPointer MemPointerParser::parse(DecomposedNodeCallback& callback) { diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 9bd441a98b98d..29fd356737b01 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -729,8 +729,14 @@ class MemPointerParser : public StackObj { // "inner" nodes of the pointer expression. This callback allows collecting // all such nodes of a pointer expression. class DecomposedNodeCallback : public StackObj { + private: + static DecomposedNodeCallback _empty; + public: virtual void callback(Node* n) { /* do nothing by default */ } + + // Singleton for default arguments. + static DecomposedNodeCallback& empty() { return _empty; } }; private: @@ -755,9 +761,9 @@ class MemPointerParser : public StackObj { _con(NoOverflowInt(0)), _mem_pointer(parse(callback)) {} - static MemPointer parse(const MemNode* mem, - DecomposedNodeCallback& callback - NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { + static MemPointer parse(NOT_PRODUCT(const TraceMemPointer& trace COMMA) + const MemNode* mem, + DecomposedNodeCallback& callback = DecomposedNodeCallback::empty()) { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are allowed"); ResourceMark rm; MemPointerParser parser(mem, callback NOT_PRODUCT(COMMA trace)); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 224bf75e4edb3..f39fb4e72dd30 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -732,9 +732,9 @@ class VPointer : public ArenaObj { public: VPointer(const MemNode* mem, const VLoop& vloop, DecomposedNodeCallback& callback) : _vloop(vloop), - _mem_pointer(MemPointerParser::parse(mem, - callback - NOT_PRODUCT(COMMA vloop.mptrace()))), + _mem_pointer(MemPointerParser::parse(NOT_PRODUCT(vloop.mptrace() COMMA) + mem, + callback)), _size(mem->memory_size()), _iv_scale(init_iv_scale()), 
_is_valid(init_is_valid()) From cb810b4589368b759d84168de29457f979767001 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 18:43:45 +0100 Subject: [PATCH 073/130] rm empty_callback --- src/hotspot/share/opto/vectorization.cpp | 3 +-- src/hotspot/share/opto/vectorization.hpp | 7 ++++--- src/hotspot/share/opto/vtransform.cpp | 3 +-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 1243e84b2b776..cb9e735e81a7d 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -190,8 +190,7 @@ void VLoopVPointers::compute_and_cache_vpointers() { int pointers_idx = 0; _body.for_each_mem([&] (MemNode* const mem, int bb_idx) { // Placement new: construct directly into the array. - MemPointerParser::DecomposedNodeCallback empty_callback; // TODO rm? - ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop, empty_callback); + ::new (&_vpointers[pointers_idx]) VPointer(mem, _vloop); _bb_idx_to_vpointer.at_put(bb_idx, pointers_idx); pointers_idx++; }); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index f39fb4e72dd30..9007319935fe7 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -730,7 +730,9 @@ class VPointer : public ArenaObj { const bool _is_valid; public: - VPointer(const MemNode* mem, const VLoop& vloop, DecomposedNodeCallback& callback) : + VPointer(const MemNode* mem, + const VLoop& vloop, + DecomposedNodeCallback& callback = DecomposedNodeCallback::empty()) : _vloop(vloop), _mem_pointer(MemPointerParser::parse(NOT_PRODUCT(vloop.mptrace() COMMA) mem, @@ -813,10 +815,9 @@ class VPointer : public ArenaObj { } bool overlap_possible_with_any_in(const GrowableArray& nodes) const { - MemPointerParser::DecomposedNodeCallback empty_callback; // TODO rm? 
for (int i = 0; i < nodes.length(); i++) { MemNode* mem = nodes.at(i)->as_Mem(); - VPointer mem_p(mem->as_Mem(), _vloop, empty_callback); + VPointer mem_p(mem->as_Mem(), _vloop); if (!never_overlaps_with(mem_p)) { return true; // possible overlap } diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index c83757c49968a..e6b9d4077ef03 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -581,8 +581,7 @@ VTransformApplyResult VTransformLoadVectorNode::apply(const VLoopAnalyzer& vloop // does not have any memory dependency. while (mem->is_StoreVector()) { // TODO refactor with VPointer for this vector load! - MemPointerParser::DecomposedNodeCallback empty_callback; // TODO rm? - VPointer store_p(mem->as_Mem(), vloop_analyzer.vloop(), empty_callback); + VPointer store_p(mem->as_Mem(), vloop_analyzer.vloop()); if (store_p.overlap_possible_with_any_in(nodes())) { break; } else { From ba9298fa3f7e5f8bfacdc0adffa2e48ea8cc503f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 19:10:40 +0100 Subject: [PATCH 074/130] fix build --- src/hotspot/share/opto/mempointer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 811bf49440880..85c4538e4b2dd 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -27,7 +27,7 @@ #include "opto/addnode.hpp" #include "utilities/resourceHash.hpp" -NOT_PRODUCT(MemPointerParser::DecomposedNodeCallback MemPointerParser::DecomposedNodeCallback::_empty;) +MemPointerParser::DecomposedNodeCallback MemPointerParser::DecomposedNodeCallback::_empty; // Recursively parse the pointer expression with a DFS all-path traversal // (i.e. with node repetitions), starting at the pointer. 
From 905bcb138d55a915902f387cd891d672cdb030f6 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 26 Nov 2024 19:30:31 +0100 Subject: [PATCH 075/130] refactor sorting --- src/hotspot/share/opto/vtransform.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index e6b9d4077ef03..cf1041077b655 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -178,6 +178,7 @@ class VMemoryRegion : public StackObj { _is_load(is_load), _schedule_order(schedule_order) {} + const VPointer& vpointer() const { return *_vpointer; } Node* base() const { return _base; } int scale() const { return _scale; } Node* invar() const { return _invar; } @@ -187,19 +188,17 @@ class VMemoryRegion : public StackObj { uint schedule_order() const { return _schedule_order; } static int cmp_for_sort_by_group(VMemoryRegion* r1, VMemoryRegion* r2) { - RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->base()->_idx, r2->base()->_idx); - RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->scale(), r2->scale()); - int r1_invar_idx = r1->invar() == nullptr ? 0 : r1->invar()->_idx; - int r2_invar_idx = r2->invar() == nullptr ? 0 : r2->invar()->_idx; - RETURN_CMP_VALUE_IF_NOT_EQUAL(r1_invar_idx, r2_invar_idx); - return 0; // equal + // Sort by mem_pointer (base, invar, iv_scale), except for the con. 
+ return MemPointer::cmp_summands(r1->vpointer().mem_pointer(), + r2->vpointer().mem_pointer()); } static int cmp_for_sort(VMemoryRegion* r1, VMemoryRegion* r2) { int cmp_group = cmp_for_sort_by_group(r1, r2); if (cmp_group != 0) { return cmp_group; } - RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->offset(), r2->offset()); + RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->vpointer().con(), + r2->vpointer().con()); return 0; // equal } From aac5bfa2a4b6bf175cb05a589a2f16e46646b323 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 10:02:47 +0100 Subject: [PATCH 076/130] impl make_with_size --- src/hotspot/share/opto/mempointer.hpp | 18 ++++++++++++++++ src/hotspot/share/opto/vectorization.hpp | 27 +++++++++++++----------- src/hotspot/share/opto/vtransform.cpp | 8 ++++--- 3 files changed, 38 insertions(+), 15 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 29fd356737b01..18a0c22a104ac 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -626,6 +626,20 @@ class MemPointer : public StackObj { assert(1 <= _size && _size <= 2048 && is_power_of_2(_size), "valid size"); } + // Mutated copy. + // The new MemPointer is identical, except it has a different size. + MemPointer(const MemPointer& old, + const jint new_size) : + NOT_PRODUCT(_trace(old._trace) COMMA) + _summands(old._summands), + _con(old.con()), + _base(old.base()), + _size(new_size) + { + // TODO trace + // TODO be careful with mutating con...NaN! 
+ } + public: static MemPointer make_trivial(Node* pointer, const jint size @@ -645,6 +659,10 @@ class MemPointer : public StackObj { } } + MemPointer make_with_size(const jint new_size) const { + return MemPointer(*this, new_size); + }; + private: MemPointerAliasing get_aliasing_with(const MemPointer& other NOT_PRODUCT(COMMA const TraceMemPointer& trace)) const; diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 9007319935fe7..0022517095b40 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -722,14 +722,23 @@ class VPointer : public ArenaObj { const VLoop& _vloop; const MemPointer _mem_pointer; - const jint _size; + const jint _size; // TODO rm! // Derived, for quicker use. const jint _iv_scale; const bool _is_valid; + VPointer(const VLoop& vloop, + const MemPointer& mem_pointer) : + _vloop(vloop), + _mem_pointer(mem_pointer), + _size(mem_pointer.size()), + _iv_scale(init_iv_scale()), + _is_valid(init_is_valid()) {} + public: + // TODO refactor VPointer(const MemNode* mem, const VLoop& vloop, DecomposedNodeCallback& callback = DecomposedNodeCallback::empty()) : @@ -750,6 +759,11 @@ class VPointer : public ArenaObj { #endif } + VPointer make_with_size(const jint new_size) const { + // TODO trace + return VPointer(_vloop, mem_pointer().make_with_size(new_size)); + } + // Accessors bool is_valid() const { return _is_valid; } const MemPointer& mem_pointer() const { assert(_is_valid, ""); return _mem_pointer; } @@ -814,17 +828,6 @@ class VPointer : public ArenaObj { return mem_pointer().never_overlaps_with(other.mem_pointer()); } - bool overlap_possible_with_any_in(const GrowableArray& nodes) const { - for (int i = 0; i < nodes.length(); i++) { - MemNode* mem = nodes.at(i)->as_Mem(); - VPointer mem_p(mem->as_Mem(), _vloop); - if (!never_overlaps_with(mem_p)) { - return true; // possible overlap - } - } - return false; - } - NOT_PRODUCT( void print_on(outputStream* 
st) const; ) private: diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index cf1041077b655..595b6c2451e64 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -581,10 +581,12 @@ VTransformApplyResult VTransformLoadVectorNode::apply(const VLoopAnalyzer& vloop while (mem->is_StoreVector()) { // TODO refactor with VPointer for this vector load! VPointer store_p(mem->as_Mem(), vloop_analyzer.vloop()); - if (store_p.overlap_possible_with_any_in(nodes())) { - break; - } else { + const VPointer& scalar_p = vpointer(vloop_analyzer); + const VPointer load_p(scalar_p.make_with_size(scalar_p.size() * vlen)); + if (store_p.never_overlaps_with(load_p)) { mem = mem->in(MemNode::Memory); + } else { + break; } } From 3c6da4939417461763b357c693e54cc911945594 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 10:19:34 +0100 Subject: [PATCH 077/130] rm unnecessary size field --- src/hotspot/share/opto/vectorization.cpp | 2 +- src/hotspot/share/opto/vectorization.hpp | 7 ++----- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index cb9e735e81a7d..1eaa7f2ff18e8 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -401,7 +401,7 @@ void VPointer::print_on(outputStream* st) const { return; } - st->print("size: %2d, base: ", _size); + st->print("size: %2d, base: ", size()); _mem_pointer.base().print_on(st); st->print(", form: "); _mem_pointer.print_form_on(st); diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 0022517095b40..04d50edae34c1 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -693,7 +693,7 @@ class VLoopAnalyzer : StackObj { VStatus setup_submodules_helper(); }; -// VPointer adapts the MemPointer to the use in a loop: +// 
VPointer wraps the MemPointer to the use in a loop: // // pointer = SUM(summands) + con // @@ -722,7 +722,6 @@ class VPointer : public ArenaObj { const VLoop& _vloop; const MemPointer _mem_pointer; - const jint _size; // TODO rm! // Derived, for quicker use. const jint _iv_scale; @@ -733,7 +732,6 @@ class VPointer : public ArenaObj { const MemPointer& mem_pointer) : _vloop(vloop), _mem_pointer(mem_pointer), - _size(mem_pointer.size()), _iv_scale(init_iv_scale()), _is_valid(init_is_valid()) {} @@ -746,7 +744,6 @@ class VPointer : public ArenaObj { _mem_pointer(MemPointerParser::parse(NOT_PRODUCT(vloop.mptrace() COMMA) mem, callback)), - _size(mem->memory_size()), _iv_scale(init_iv_scale()), _is_valid(init_is_valid()) { @@ -767,7 +764,7 @@ class VPointer : public ArenaObj { // Accessors bool is_valid() const { return _is_valid; } const MemPointer& mem_pointer() const { assert(_is_valid, ""); return _mem_pointer; } - jint size() const { assert(_is_valid, ""); return _size; } + jint size() const { assert(_is_valid, ""); return mem_pointer().size(); } jint iv_scale() const { assert(_is_valid, ""); return _iv_scale; } jint con() const { return mem_pointer().con().value(); } From 73ddd9dba58d9fce03a71ad525e78f72bb0e47dd Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 10:24:01 +0100 Subject: [PATCH 078/130] refactor VPointer ctor --- src/hotspot/share/opto/vectorization.hpp | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 04d50edae34c1..2b4bab410ff77 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -736,16 +736,13 @@ class VPointer : public ArenaObj { _is_valid(init_is_valid()) {} public: - // TODO refactor VPointer(const MemNode* mem, const VLoop& vloop, DecomposedNodeCallback& callback = DecomposedNodeCallback::empty()) : - _vloop(vloop), - 
_mem_pointer(MemPointerParser::parse(NOT_PRODUCT(vloop.mptrace() COMMA) - mem, - callback)), - _iv_scale(init_iv_scale()), - _is_valid(init_is_valid()) + VPointer(vloop, + MemPointerParser::parse(NOT_PRODUCT(vloop.mptrace() COMMA) + mem, + callback)) { #ifndef PRODUCT if (vloop.mptrace().is_trace_parsing()) { From 352da428153b595ebfc7cf725152e03c01c2cdd0 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 10:36:11 +0100 Subject: [PATCH 079/130] more trace --- src/hotspot/share/opto/mempointer.hpp | 1 - src/hotspot/share/opto/vectorization.hpp | 11 +++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 18a0c22a104ac..963d26a376602 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -636,7 +636,6 @@ class MemPointer : public StackObj { _base(old.base()), _size(new_size) { - // TODO trace // TODO be careful with mutating con...NaN! 
} diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 2b4bab410ff77..3194bd702815c 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -754,8 +754,15 @@ class VPointer : public ArenaObj { } VPointer make_with_size(const jint new_size) const { - // TODO trace - return VPointer(_vloop, mem_pointer().make_with_size(new_size)); + const VPointer p(_vloop, mem_pointer().make_with_size(new_size)); +#ifndef PRODUCT + if (_vloop.mptrace().is_trace_parsing()) { + tty->print_cr("VPointer::make_with_size:"); + tty->print(" old: "); print_on(tty); + tty->print(" new: "); p.print_on(tty); + } +#endif + return p; } // Accessors From b1374e5ff22d87da5b25221e9c1969b39bd03c0c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 11:07:00 +0100 Subject: [PATCH 080/130] route VPointer into MemVector --- src/hotspot/share/opto/superword.cpp | 5 ++- .../share/opto/superwordVTransformBuilder.cpp | 8 +++-- src/hotspot/share/opto/vtransform.hpp | 32 +++++++++++++------ 3 files changed, 30 insertions(+), 15 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 93c0225aa5e66..293013c2ebd89 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2658,10 +2658,9 @@ void VTransform::determine_mem_ref_and_aw_for_main_loop_alignment() { const GrowableArray& vtnodes = _graph.vtnodes(); for (int i = 0; i < vtnodes.length(); i++) { - VTransformVectorNode* vtn = vtnodes.at(i)->isa_Vector(); + VTransformMemVectorNode* vtn = vtnodes.at(i)->isa_MemVector(); if (vtn == nullptr) { continue; } - MemNode* p0 = vtn->nodes().at(0)->isa_Mem(); - if (p0 == nullptr) { continue; } + MemNode* p0 = vtn->nodes().at(0)->as_Mem(); int vw = p0->memory_size() * vtn->nodes().length(); if (vw > max_aw) { diff --git a/src/hotspot/share/opto/superwordVTransformBuilder.cpp 
b/src/hotspot/share/opto/superwordVTransformBuilder.cpp index 2e32ce28d3ccb..dd1bd2b851f80 100644 --- a/src/hotspot/share/opto/superwordVTransformBuilder.cpp +++ b/src/hotspot/share/opto/superwordVTransformBuilder.cpp @@ -139,9 +139,13 @@ VTransformVectorNode* SuperWordVTransformBuilder::make_vector_vtnode_for_pack(co VTransformVectorNode* vtn = nullptr; if (p0->is_Load()) { - vtn = new (_vtransform.arena()) VTransformLoadVectorNode(_vtransform, pack_size); + const VPointer& scalar_p = _vloop_analyzer.vpointers().vpointer(p0->as_Load()); + const VPointer vector_p(scalar_p.make_with_size(scalar_p.size() * pack_size)); + vtn = new (_vtransform.arena()) VTransformLoadVectorNode(_vtransform, pack_size, vector_p); } else if (p0->is_Store()) { - vtn = new (_vtransform.arena()) VTransformStoreVectorNode(_vtransform, pack_size); + const VPointer& scalar_p = _vloop_analyzer.vpointers().vpointer(p0->as_Store()); + const VPointer vector_p(scalar_p.make_with_size(scalar_p.size() * pack_size)); + vtn = new (_vtransform.arena()) VTransformStoreVectorNode(_vtransform, pack_size, vector_p); } else if (p0->is_Bool()) { VTransformBoolTest kind = _packset.get_bool_test(pack); vtn = new (_vtransform.arena()) VTransformBoolVectorNode(_vtransform, pack_size, kind); diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 9090f68483394..4b5346d5d9deb 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -66,6 +66,7 @@ class VTransformVectorNode; class VTransformElementWiseVectorNode; class VTransformBoolVectorNode; class VTransformReductionVectorNode; +class VTransformMemVectorNode; class VTransformLoadVectorNode; class VTransformStoreVectorNode; @@ -314,6 +315,7 @@ class VTransformNode : public ArenaObj { virtual VTransformElementWiseVectorNode* isa_ElementWiseVector() { return nullptr; } virtual VTransformBoolVectorNode* isa_BoolVector() { return nullptr; } virtual VTransformReductionVectorNode* 
isa_ReductionVector() { return nullptr; } + virtual VTransformMemVectorNode* isa_MemVector() { return nullptr; } virtual VTransformLoadVectorNode* isa_LoadVector() { return nullptr; } virtual VTransformStoreVectorNode* isa_StoreVector() { return nullptr; } @@ -481,30 +483,40 @@ class VTransformReductionVectorNode : public VTransformVectorNode { NOT_PRODUCT(virtual const char* name() const override { return "ReductionVector"; };) }; -class VTransformLoadVectorNode : public VTransformVectorNode { +class VTransformMemVectorNode : public VTransformVectorNode { +private: + const VPointer _vpointer; // with size of the vector + +public: + VTransformMemVectorNode(VTransform& vtransform, const uint req, uint number_of_nodes, const VPointer& vpointer) : + VTransformVectorNode(vtransform, req, number_of_nodes), + _vpointer(vpointer) {} + + virtual VTransformMemVectorNode* isa_MemVector() override { return this; } + virtual bool is_load_or_store_in_loop() const override { return true; } + virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return _vpointer; } +}; + +class VTransformLoadVectorNode : public VTransformMemVectorNode { public: // req = 3 -> [ctrl, mem, adr] - VTransformLoadVectorNode(VTransform& vtransform, uint number_of_nodes) : - VTransformVectorNode(vtransform, 3, number_of_nodes) {} + VTransformLoadVectorNode(VTransform& vtransform, uint number_of_nodes, const VPointer& vpointer) : + VTransformMemVectorNode(vtransform, 3, number_of_nodes, vpointer) {} LoadNode::ControlDependency control_dependency() const; virtual VTransformLoadVectorNode* isa_LoadVector() override { return this; } virtual bool is_load_in_loop() const override { return true; } - virtual bool is_load_or_store_in_loop() const override { return true; } - virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); } virtual VTransformApplyResult apply(const 
VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "LoadVector"; };) }; -class VTransformStoreVectorNode : public VTransformVectorNode { +class VTransformStoreVectorNode : public VTransformMemVectorNode { public: // req = 4 -> [ctrl, mem, adr, val] - VTransformStoreVectorNode(VTransform& vtransform, uint number_of_nodes) : - VTransformVectorNode(vtransform, 4, number_of_nodes) {} + VTransformStoreVectorNode(VTransform& vtransform, uint number_of_nodes, const VPointer& vpointer) : + VTransformMemVectorNode(vtransform, 4, number_of_nodes, vpointer) {} virtual VTransformStoreVectorNode* isa_StoreVector() override { return this; } virtual bool is_load_in_loop() const override { return false; } - virtual bool is_load_or_store_in_loop() const override { return true; } - virtual const VPointer& vpointer(const VLoopAnalyzer& vloop_analyzer) const override { return vloop_analyzer.vpointers().vpointer(nodes().at(0)->as_Mem()); } virtual VTransformApplyResult apply(const VLoopAnalyzer& vloop_analyzer, const GrowableArray& vnode_idx_to_transformed_node) const override; NOT_PRODUCT(virtual const char* name() const override { return "StoreVector"; };) From 27817ac1ee0350d28fa3fee43ab9ed9a001ca662 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 11:20:21 +0100 Subject: [PATCH 081/130] fix some printing --- src/hotspot/share/opto/mempointer.hpp | 2 +- src/hotspot/share/opto/vectorization.cpp | 5 +++-- src/hotspot/share/opto/vectorization.hpp | 2 +- src/hotspot/share/opto/vtransform.cpp | 6 +++--- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 963d26a376602..618e735e37498 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -729,7 +729,7 @@ class MemPointer : public StackObj { } void print_on(outputStream* st, 
bool end_with_cr = true) const { - st->print("MemPointer[base: "); + st->print("MemPointer[size: %2d, base: ", size()); _base.print_on(st); st->print(", form: "); print_form_on(st); diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 1eaa7f2ff18e8..a61f3a3e0b2e5 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -393,7 +393,7 @@ void VLoopDependencyGraph::PredsIterator::next() { } #ifndef PRODUCT -void VPointer::print_on(outputStream* st) const { +void VPointer::print_on(outputStream* st, bool end_with_cr) const { st->print("VPointer["); if (!is_valid()) { @@ -410,7 +410,8 @@ void VPointer::print_on(outputStream* st) const { s.print_on(tty); st->print(","); }); - st->print_cr("]"); + st->print("]"); + if (end_with_cr) { st->cr(); } } #endif diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 3194bd702815c..fc9bb28ef9b53 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -829,7 +829,7 @@ class VPointer : public ArenaObj { return mem_pointer().never_overlaps_with(other.mem_pointer()); } - NOT_PRODUCT( void print_on(outputStream* st) const; ) + NOT_PRODUCT( void print_on(outputStream* st, bool end_with_cr = true) const; ) private: jint init_iv_scale() const { diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 595b6c2451e64..d0b8ec0bac0a4 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -222,9 +222,9 @@ class VMemoryRegion : public StackObj { #ifndef PRODUCT void print() const { - tty->print("VMemoryRegion[%s %dbytes, schedule_order(%4d), ", - _is_load ? "load " : "store", _memory_size, _schedule_order); - _vpointer->mem_pointer().print_on(tty, false); + tty->print("VMemoryRegion[%s schedule_order(%4d), ", + _is_load ? 
"load, " : "store,", _schedule_order); + _vpointer->print_on(tty, false); tty->print_cr("]"); } #endif From b1460329c057b4998ce076e301731e63cd47b29b Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 11:54:59 +0100 Subject: [PATCH 082/130] more refactoring --- src/hotspot/share/opto/vectorization.hpp | 8 +++++ src/hotspot/share/opto/vtransform.cpp | 39 ++++++++---------------- 2 files changed, 20 insertions(+), 27 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index fc9bb28ef9b53..b86f607e48b97 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -765,6 +765,14 @@ class VPointer : public ArenaObj { return p; } + // old_pointer = base + invar + iv_scale * iv + con + // new_pointer = base + invar + iv_scale * (iv + iv_offset) + con + // = base + invar + iv_scale * iv + (con + iv_scale * iv_offset) + VPointer make_with_iv_offset(const jint iv_offset) const { + // TODO + return *this; + } + // Accessors bool is_valid() const { return _is_valid; } const MemPointer& mem_pointer() const { assert(_is_valid, ""); return _mem_pointer; } diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index d0b8ec0bac0a4..027ea9ac6c79a 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -156,34 +156,18 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const { class VMemoryRegion : public StackObj { private: const VPointer* _vpointer; // reference not possible, need empty VMemoryRegion constructor for GrowableArray - - // TODO rm? - maybe also fix printing? - Node* _base; - int _scale; - Node* _invar; - int _offset; - uint _memory_size; bool _is_load; // load or store? 
uint _schedule_order; public: VMemoryRegion() : _vpointer(nullptr) {} // empty constructor for GrowableArray - VMemoryRegion(const VPointer& vpointer, int iv_offset, int vector_length, bool is_load, uint schedule_order) : + VMemoryRegion(const VPointer& vpointer, bool is_load, uint schedule_order) : + // TODO need to copy, otherwise reference item on stack!!! _vpointer(&vpointer), - _base( vpointer.mem_pointer().base().object_or_native()), - _scale( vpointer.iv_scale()), - _invar( nullptr), // TODO - _offset( vpointer.con() + _scale * iv_offset), - _memory_size(vpointer.size() * vector_length), _is_load(is_load), _schedule_order(schedule_order) {} const VPointer& vpointer() const { return *_vpointer; } - Node* base() const { return _base; } - int scale() const { return _scale; } - Node* invar() const { return _invar; } - int offset() const { return _offset; } - uint memory_size() const { return _memory_size; } bool is_load() const { return _is_load; } uint schedule_order() const { return _schedule_order; } @@ -209,14 +193,14 @@ class VMemoryRegion : public StackObj { VMemoryRegion* p2 = &other; if (cmp_for_sort_by_group(p1, p2) != 0) { return DIFFERENT_GROUP; } - jlong offset1 = p1->offset(); - jlong offset2 = p2->offset(); - jlong memory_size1 = p1->memory_size(); - jlong memory_size2 = p2->memory_size(); + jlong con1 = p1->vpointer().con(); + jlong con2 = p2->vpointer().con(); + jlong size1 = p1->vpointer().size(); + jlong size2 = p2->vpointer().size(); - if (offset1 >= offset2 + memory_size2) { return AFTER; } - if (offset2 >= offset1 + memory_size1) { return BEFORE; } - if (offset1 == offset2 && memory_size1 == memory_size2) { return EXACT_OVERLAP; } + if (con1 >= con2 + size2) { return AFTER; } + if (con2 >= con1 + size1) { return BEFORE; } + if (con1 == con2 && size1 == size2) { return EXACT_OVERLAP; } return PARTIAL_OVERLAP; } @@ -224,7 +208,7 @@ class VMemoryRegion : public StackObj { void print() const { tty->print("VMemoryRegion[%s schedule_order(%4d), ", 
_is_load ? "load, " : "store,", _schedule_order); - _vpointer->print_on(tty, false); + vpointer().print_on(tty, false); tty->print_cr("]"); } #endif @@ -357,7 +341,8 @@ bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& VTransformVectorNode* vector = vtn->isa_Vector(); uint vector_length = vector != nullptr ? vector->nodes().length() : 1; bool is_load = vtn->is_load_in_loop(); - memory_regions.push(VMemoryRegion(p, iv_offset, vector_length, is_load, schedule_order++)); + const VPointer iv_offset_p(p.make_with_iv_offset(iv_offset)); + memory_regions.push(VMemoryRegion(iv_offset_p, is_load, schedule_order++)); } } } From 262d0fb5598fa46d7a106965135d9838daeb308a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 12:04:51 +0100 Subject: [PATCH 083/130] resource allocate VMemoryRegion --- src/hotspot/share/opto/vtransform.cpp | 34 +++++++++++++++------------ 1 file changed, 19 insertions(+), 15 deletions(-) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 027ea9ac6c79a..314c11e8b43b3 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -153,21 +153,24 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const { // It represents a memory region: // [adr, adr + memory_size) // adr = base + invar + iv_scale * iv + con -class VMemoryRegion : public StackObj { +// TODO comment +class VMemoryRegion : public ResourceObj { private: - const VPointer* _vpointer; // reference not possible, need empty VMemoryRegion constructor for GrowableArray + // Note: VPointer has no default constructor, so we cannot use VMemoryRegion + // in-place in a GrowableArray. Hence, we make VMemoryRegion a resource + // allocated object, so the GrowableArray of VMemoryRegion* has a default + // nullptr element. + const VPointer _vpointer; bool _is_load; // load or store? 
uint _schedule_order; public: - VMemoryRegion() : _vpointer(nullptr) {} // empty constructor for GrowableArray VMemoryRegion(const VPointer& vpointer, bool is_load, uint schedule_order) : - // TODO need to copy, otherwise reference item on stack!!! - _vpointer(&vpointer), + _vpointer(vpointer), _is_load(is_load), _schedule_order(schedule_order) {} - const VPointer& vpointer() const { return *_vpointer; } + const VPointer& vpointer() const { return _vpointer; } bool is_load() const { return _is_load; } uint schedule_order() const { return _schedule_order; } @@ -177,12 +180,12 @@ class VMemoryRegion : public StackObj { r2->vpointer().mem_pointer()); } - static int cmp_for_sort(VMemoryRegion* r1, VMemoryRegion* r2) { - int cmp_group = cmp_for_sort_by_group(r1, r2); + static int cmp_for_sort(VMemoryRegion** r1, VMemoryRegion** r2) { + int cmp_group = cmp_for_sort_by_group(*r1, *r2); if (cmp_group != 0) { return cmp_group; } - RETURN_CMP_VALUE_IF_NOT_EQUAL(r1->vpointer().con(), - r2->vpointer().con()); + RETURN_CMP_VALUE_IF_NOT_EQUAL((*r1)->vpointer().con(), + (*r2)->vpointer().con()); return 0; // equal } @@ -315,7 +318,8 @@ bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& // Collect all pointers for scalar and vector loads/stores. ResourceMark rm; - GrowableArray memory_regions; + // Use pointers because no default constructor for elements available. + GrowableArray memory_regions; // To detect store-to-load-forwarding failures at the iteration threshold or below, we // simulate a super-unrolling to reach SuperWordStoreToLoadForwardingFailureDetection @@ -342,7 +346,7 @@ bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& uint vector_length = vector != nullptr ? 
vector->nodes().length() : 1; bool is_load = vtn->is_load_in_loop(); const VPointer iv_offset_p(p.make_with_iv_offset(iv_offset)); - memory_regions.push(VMemoryRegion(iv_offset_p, is_load, schedule_order++)); + memory_regions.push(new VMemoryRegion(iv_offset_p, is_load, schedule_order++)); } } } @@ -357,7 +361,7 @@ bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& tty->print_cr(" simulated_unrolling_count = %d", simulated_unrolling_count); tty->print_cr(" simulated_super_unrolling_count = %d", simulated_super_unrolling_count); for (int i = 0; i < memory_regions.length(); i++) { - VMemoryRegion& region = memory_regions.at(i); + VMemoryRegion& region = *memory_regions.at(i); region.print(); } } @@ -365,10 +369,10 @@ bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& // For all pairs of pointers in the same group, check if they have a partial overlap. for (int i = 0; i < memory_regions.length(); i++) { - VMemoryRegion& region1 = memory_regions.at(i); + VMemoryRegion& region1 = *memory_regions.at(i); for (int j = i + 1; j < memory_regions.length(); j++) { - VMemoryRegion& region2 = memory_regions.at(j); + VMemoryRegion& region2 = *memory_regions.at(j); const VMemoryRegion::Aliasing aliasing = region1.aliasing(region2); if (aliasing == VMemoryRegion::Aliasing::DIFFERENT_GROUP || From 433163cb20358e3e74aa8efbdb95edd1e28f03f3 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 12:07:37 +0100 Subject: [PATCH 084/130] fix comments --- src/hotspot/share/opto/vtransform.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 314c11e8b43b3..6408a7365b17a 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -150,10 +150,10 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const { if (a > b) { return 1; } // Helper-class for 
VTransformGraph::has_store_to_load_forwarding_failure. -// It represents a memory region: -// [adr, adr + memory_size) -// adr = base + invar + iv_scale * iv + con -// TODO comment +// It wraps a VPointer. The VPointer has an iv_offset applied, which +// simulates a virtual unrolling. It represents the memory region: +// [adr, adr + size) +// adr = base + invar + iv_scale * (iv + iv_offset) + con class VMemoryRegion : public ResourceObj { private: // Note: VPointer has no default constructor, so we cannot use VMemoryRegion From 0d6f13a11a2a14d817456115869aef5fcdd85768 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 12:36:07 +0100 Subject: [PATCH 085/130] make_with_iv_offset impl --- src/hotspot/share/opto/mempointer.hpp | 13 ++++++++---- src/hotspot/share/opto/vectorization.hpp | 25 ++++++++++++++++++++---- 2 files changed, 30 insertions(+), 8 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 618e735e37498..116a2d531832c 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -627,16 +627,17 @@ class MemPointer : public StackObj { } // Mutated copy. - // The new MemPointer is identical, except it has a different size. + // The new MemPointer is identical, except it has a different size and con. MemPointer(const MemPointer& old, + const NoOverflowInt new_con, const jint new_size) : NOT_PRODUCT(_trace(old._trace) COMMA) _summands(old._summands), - _con(old.con()), + _con(new_con), _base(old.base()), _size(new_size) { - // TODO be careful with mutating con...NaN! 
+ assert(!_con.is_NaN(), "non-NaN constant"); } public: @@ -659,7 +660,11 @@ class MemPointer : public StackObj { } MemPointer make_with_size(const jint new_size) const { - return MemPointer(*this, new_size); + return MemPointer(*this, this->con(), new_size); + }; + + MemPointer make_with_con(const NoOverflowInt new_con) const { + return MemPointer(*this, new_con, this->size()); }; private: diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index b86f607e48b97..582a59a7f6fbd 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -729,11 +729,12 @@ class VPointer : public ArenaObj { const bool _is_valid; VPointer(const VLoop& vloop, - const MemPointer& mem_pointer) : + const MemPointer& mem_pointer, + const bool must_be_invalid = false) : _vloop(vloop), _mem_pointer(mem_pointer), _iv_scale(init_iv_scale()), - _is_valid(init_is_valid()) {} + _is_valid(!must_be_invalid && init_is_valid()) {} public: VPointer(const MemNode* mem, @@ -769,8 +770,24 @@ class VPointer : public ArenaObj { // new_pointer = base + invar + iv_scale * (iv + iv_offset) + con // = base + invar + iv_scale * iv + (con + iv_scale * iv_offset) VPointer make_with_iv_offset(const jint iv_offset) const { - // TODO - return *this; + NoOverflowInt new_con = NoOverflowInt(con()) + NoOverflowInt(iv_scale()) * NoOverflowInt(iv_offset); + if (new_con.is_NaN()) { + assert("false", "TODO find a case"); // TODO + return make_invalid(); + } + const VPointer p(_vloop, mem_pointer().make_with_con(new_con)); +#ifndef PRODUCT + if (_vloop.mptrace().is_trace_parsing()) { + tty->print_cr("VPointer::make_with_iv_offset:"); + tty->print(" old: "); print_on(tty); + tty->print(" new: "); p.print_on(tty); + } +#endif + return p; + } + + VPointer make_invalid() const { + return VPointer(_vloop, mem_pointer(), true /* must be invalid*/); } // Accessors From b9a3cba6cb59e845adc9ef68393a5e9fdce7da5c Mon Sep 17 00:00:00 2001 From: 
Emanuel Peter Date: Wed, 27 Nov 2024 14:13:59 +0100 Subject: [PATCH 086/130] cleanup and fix --- src/hotspot/share/opto/vtransform.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 6408a7365b17a..ec9ca41e54a1a 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -343,7 +343,6 @@ bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& const VPointer& p = vtn->vpointer(vloop_analyzer); if (p.is_valid()) { VTransformVectorNode* vector = vtn->isa_Vector(); - uint vector_length = vector != nullptr ? vector->nodes().length() : 1; bool is_load = vtn->is_load_in_loop(); const VPointer iv_offset_p(p.make_with_iv_offset(iv_offset)); memory_regions.push(new VMemoryRegion(iv_offset_p, is_load, schedule_order++)); @@ -567,11 +566,9 @@ VTransformApplyResult VTransformLoadVectorNode::apply(const VLoopAnalyzer& vloop // Set the memory dependency of the LoadVector as early as possible. // Walk up the memory chain, and ignore any StoreVector that provably // does not have any memory dependency. + const VPointer& load_p = vpointer(vloop_analyzer); while (mem->is_StoreVector()) { - // TODO refactor with VPointer for this vector load! 
VPointer store_p(mem->as_Mem(), vloop_analyzer.vloop()); - const VPointer& scalar_p = vpointer(vloop_analyzer); - const VPointer load_p(scalar_p.make_with_size(scalar_p.size() * vlen)); if (store_p.never_overlaps_with(load_p)) { mem = mem->in(MemNode::Memory); } else { From c7d84444f376facd973d251cb0ac5829bd6ff364 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 14:41:23 +0100 Subject: [PATCH 087/130] add test for con overflow in SuperWordStoreToLoadForwardingFailureDetection --- .../superword/TestLargeScaleAndStride.java | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java index b3453c24d7783..7811d31cd8ef0 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java @@ -38,6 +38,14 @@ * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+AlignVector compiler.loopopts.superword.TestLargeScaleAndStride */ +/* + * @test id=StoreToLoadForwardingFailureDetection + * @bug 8328938 + * @modules java.base/jdk.internal.misc + * @library /test/lib / + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:SuperWordStoreToLoadForwardingFailureDetection=4096 compiler.loopopts.superword.TestLargeScaleAndStride + */ + package compiler.loopopts.superword; import jdk.internal.misc.Unsafe; @@ -48,6 +56,7 @@ public class TestLargeScaleAndStride { public static void main(String[] args) { byte[] a = new byte[100]; + byte[] b = new byte[RANGE]; fill(a); byte[] gold1a = a.clone(); @@ -57,6 +66,7 @@ public static void main(String[] args) { byte[] gold2c = a.clone(); byte[] gold2d = a.clone(); byte[] gold3 = a.clone(); + byte[] gold4 = b.clone(); test1a(gold1a); test1b(gold1b); test2a(gold2a); @@ -64,6 +74,7 @@ public static void main(String[] args) { test2c(gold2c); test2d(gold2d); 
test3(gold3); + test4(gold4); for (int i = 0; i < 100; i++) { byte[] c = a.clone(); @@ -106,6 +117,12 @@ public static void main(String[] args) { test3(c); verify(c, gold3); } + + for (int i = 0; i < 100; i++) { + byte[] c = b.clone(); + test4(c); + verify(c, gold4); + } } static void fill(byte[] a) { @@ -249,4 +266,17 @@ static void test3(byte[] a) { UNSAFE.putByte(a, base + (int)(j + 3), (byte)(v3 + 1)); } } + + // VPointer con overflow possible with large SuperWordStoreToLoadForwardingFailureDetection + static final int test4_BIG = (1 << 31)-1000; + static int test4_big = (1 << 31)-1000; + static void test4(byte[] a) { + long zero = test4_BIG - test4_big; + for (int i = 0; i < RANGE; i++) { + long base = UNSAFE.ARRAY_INT_BASE_OFFSET; + long adr = base + zero + i; + byte v0 = UNSAFE.getByte(a, adr); + UNSAFE.putByte(a, adr, (byte)(v0 + 1)); + } + } } From c578f74dcdbaf9593fac9efbf55e74f201bca36f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 14:53:24 +0100 Subject: [PATCH 088/130] fix invalid case in make_with_iv_offset --- src/hotspot/share/opto/vectorization.hpp | 8 +++++++- src/hotspot/share/opto/vtransform.cpp | 6 +++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 582a59a7f6fbd..5b1af7c70ac24 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -772,7 +772,13 @@ class VPointer : public ArenaObj { VPointer make_with_iv_offset(const jint iv_offset) const { NoOverflowInt new_con = NoOverflowInt(con()) + NoOverflowInt(iv_scale()) * NoOverflowInt(iv_offset); if (new_con.is_NaN()) { - assert("false", "TODO find a case"); // TODO +#ifndef PRODUCT + if (_vloop.mptrace().is_trace_parsing()) { + tty->print_cr("VPointer::make_with_iv_offset:"); + tty->print(" old: "); print_on(tty); + tty->print_cr(" new con overflow (iv_offset: %d) -> invalid VPointer.", iv_offset); + } +#endif return 
make_invalid(); } const VPointer p(_vloop, mem_pointer().make_with_con(new_con)); diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index ec9ca41e54a1a..725b3ad153595 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -345,7 +345,11 @@ bool VTransformGraph::has_store_to_load_forwarding_failure(const VLoopAnalyzer& VTransformVectorNode* vector = vtn->isa_Vector(); bool is_load = vtn->is_load_in_loop(); const VPointer iv_offset_p(p.make_with_iv_offset(iv_offset)); - memory_regions.push(new VMemoryRegion(iv_offset_p, is_load, schedule_order++)); + if (iv_offset_p.is_valid()) { + // The iv_offset may lead to overflows. This is a heuristic, so we do not + // care too much about those edge cases. + memory_regions.push(new VMemoryRegion(iv_offset_p, is_load, schedule_order++)); + } } } } From 8a1a37655c7e6439ca6c2267c3f6b11682eb28ca Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 14:55:04 +0100 Subject: [PATCH 089/130] fix nullptr --- src/hotspot/share/opto/mempointer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 116a2d531832c..85177b7e69230 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -795,7 +795,7 @@ class MemPointerParser : public StackObj { tty->print_cr("\nMemPointerParser::parse:"); tty->print(" mem: "); mem->dump(); parser.mem_pointer().print_on(tty); - mem->in(MemNode::Address)->dump_bfs(7, 0, "d"); + mem->in(MemNode::Address)->dump_bfs(7, nullptr, "d"); } #endif From 3cfc86e58aab25a4a3850370ae30402e39f892fa Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 15:28:48 +0100 Subject: [PATCH 090/130] SuperWord::unrolling_analysis parse all with VPointer --- src/hotspot/share/opto/superword.cpp | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git 
a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 293013c2ebd89..62433faf05b28 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -172,16 +172,9 @@ void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_fa // save a queue of post process nodes if (n_ctrl != nullptr && lpt->is_member(phase->get_loop(n_ctrl))) { - // Process the memory expression - if (!adr->is_AddP()) { - NOT_PRODUCT( n->dump(); ) - NOT_PRODUCT( adr->dump(); ) - assert(false, "what is this?"); - ignored_nodes.set_ignored(adr); - } else { - // Mark the internal nodes of the address expression in ignored_nodes. - VPointer xp(current, vloop, ignored_nodes); - } + // Parse the address expression with VPointer, and mark the internal + // nodes of the address expression in ignored_nodes. + VPointer p(current, vloop, ignored_nodes); } } } From 62d1a9c26b48da4c1bde9c8f3f63c94118d28d38 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 15:48:53 +0100 Subject: [PATCH 091/130] turn assert into check --- src/hotspot/share/opto/vectorization.hpp | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 5b1af7c70ac24..941db7cd258fe 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -189,7 +189,12 @@ class VLoop : public StackObj { // before or inside the pre-loop. For example, alignment of main-loop vector // memops must be acheived in the pre-loop, via the exit check in the pre-loop. bool is_pre_loop_invariant(Node* n) const { - assert(cl()->is_main_loop(), "must be"); + // Must be in the main-loop, otherwise we can't access the pre-loop. + // This fails during SuperWord::unrolling_analysis, but that is ok. + if (!cl()->is_main_loop()) { + return false; + } + Node* ctrl = phase()->get_ctrl(n); // Quick test: is it in the main-loop? 
From c887b6b5429666d2ca09032d436502738b0b6144 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 15:55:24 +0100 Subject: [PATCH 092/130] overlap with invalid VPointer --- src/hotspot/share/opto/vectorization.hpp | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 941db7cd258fe..8acacb4d385fc 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -850,6 +850,14 @@ class VPointer : public ArenaObj { } bool is_adjacent_to_and_before(const VPointer& other) const { + if (!is_valid() || !other.is_valid()) { +#ifndef PRODUCT + if (_vloop.mptrace().is_trace_overlap()) { + tty->print_cr("VPointer::is_adjacent_to_and_before: invalid VPointer, adjacency unknown."); + } +#endif + return false; + } return mem_pointer().is_adjacent_to_and_before(other.mem_pointer()); } @@ -857,7 +865,7 @@ class VPointer : public ArenaObj { if (!is_valid() || !other.is_valid()) { #ifndef PRODUCT if (_vloop.mptrace().is_trace_overlap()) { - tty->print_cr("Never Overlap: false, because of invalid VPointer."); + tty->print_cr("VPointer::never_overlaps_with: invalid VPointer, overlap unknown."); } #endif return false; From 87c4df1108246afd3891b414c0aee474fb8ef900 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 16:04:40 +0100 Subject: [PATCH 093/130] one more case works! 
--- .../compiler/loopopts/superword/TestAlignVector.java | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java index 60d753ee75f6b..19b5075e15fad 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java @@ -969,14 +969,15 @@ static Object[] test11dL(long[] a, long[] b, long mask, int invar) { } @Test - @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", - IRNode.AND_VB, "= 0", - IRNode.STORE_VECTOR, "= 0"}, + @IR(counts = {IRNode.LOAD_VECTOR_B, IRNode.VECTOR_SIZE + "min(max_byte, 4)", "> 0", + IRNode.AND_VB, IRNode.VECTOR_SIZE + "min(max_byte, 4)", "> 0", + IRNode.STORE_VECTOR, "> 0"}, applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) static Object[] test12(byte[] a, byte[] b, byte mask) { for (int i = 0; i < RANGE/16; i++) { - // Currently does not vectorize at all + // Non-power-of-2 stride. Vectorization of 4 bytes, then 2-bytes gap. 
b[i*6 + 0 ] = (byte)(a[i*6 + 0 ] & mask); b[i*6 + 1 ] = (byte)(a[i*6 + 1 ] & mask); b[i*6 + 2 ] = (byte)(a[i*6 + 2 ] & mask); From ae36e2424096a8fbf99300a92a1038b7acc52483 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 17:17:15 +0100 Subject: [PATCH 094/130] copy array instead of initializing --- src/hotspot/share/opto/mempointer.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 85177b7e69230..2939459b570f5 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -632,12 +632,14 @@ class MemPointer : public StackObj { const NoOverflowInt new_con, const jint new_size) : NOT_PRODUCT(_trace(old._trace) COMMA) - _summands(old._summands), _con(new_con), _base(old.base()), _size(new_size) { assert(!_con.is_NaN(), "non-NaN constant"); + for (int i = 0; i < SUMMANDS_SIZE; i++) { + _summands[i] = old.summands_at(i); + } } public: From edfffc8b445a42325791750728cafcea4024f66b Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 27 Nov 2024 17:26:14 +0100 Subject: [PATCH 095/130] fix some invar sorting cases... but not all! 
--- .../loopopts/superword/TestMemorySegment.java | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java index 02197d94a512f..9eb97709224b3 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java @@ -456,13 +456,11 @@ static Object[] testIntLoop_longIndex_longInvar_sameAdr_byte(MemorySegment a, lo } @Test - @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", - IRNode.ADD_VB, "= 0", - IRNode.STORE_VECTOR, "= 0"}, + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, applyIfPlatform = {"64-bit", "true"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) - // FAILS: invariants are sorted differently, because of differently inserted Cast. - // See: JDK-8330274 static Object[] testIntLoop_longIndex_intInvar_byte(MemorySegment a, int invar) { for (int i = 0; i < (int)a.byteSize(); i++) { long adr1 = (long)(i) + (long)(invar); @@ -474,13 +472,11 @@ static Object[] testIntLoop_longIndex_intInvar_byte(MemorySegment a, int invar) } @Test - @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", - IRNode.ADD_VB, "= 0", - IRNode.STORE_VECTOR, "= 0"}, + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, applyIfPlatform = {"64-bit", "true"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) - // FAILS: invariants are sorted differently, because of differently inserted Cast. 
- // See: JDK-8330274 static Object[] testIntLoop_longIndex_longInvar_byte(MemorySegment a, long invar) { for (int i = 0; i < (int)a.byteSize(); i++) { long adr1 = (long)(i) + (long)(invar); @@ -556,13 +552,11 @@ static Object[] testIntLoop_longIndex_longInvar_sameAdr_int(MemorySegment a, lon } @Test - @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0", - IRNode.ADD_VI, "= 0", - IRNode.STORE_VECTOR, "= 0"}, + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0"}, applyIfPlatform = {"64-bit", "true"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) - // FAILS: invariants are sorted differently, because of differently inserted Cast. - // See: JDK-8330274 static Object[] testIntLoop_longIndex_intInvar_int(MemorySegment a, int invar) { for (int i = 0; i < (int)a.byteSize()/4; i++) { long adr1 = 4L * (long)(i) + 4L * (long)(invar); @@ -574,13 +568,11 @@ static Object[] testIntLoop_longIndex_intInvar_int(MemorySegment a, int invar) { } @Test - @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0", - IRNode.ADD_VI, "= 0", - IRNode.STORE_VECTOR, "= 0"}, + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0"}, applyIfPlatform = {"64-bit", "true"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) - // FAILS: invariants are sorted differently, because of differently inserted Cast. 
- // See: JDK-8330274 static Object[] testIntLoop_longIndex_longInvar_int(MemorySegment a, long invar) { for (int i = 0; i < (int)a.byteSize()/4; i++) { long adr1 = 4L * (long)(i) + 4L * (long)(invar); From 52053a662f66039b3d6ade518475f0ca816b6760 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 28 Nov 2024 06:56:51 +0100 Subject: [PATCH 096/130] unlock diagnostics for test --- .../compiler/loopopts/superword/TestLargeScaleAndStride.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java index 7811d31cd8ef0..850285fa5046f 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java @@ -43,7 +43,7 @@ * @bug 8328938 * @modules java.base/jdk.internal.misc * @library /test/lib / - * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:SuperWordStoreToLoadForwardingFailureDetection=4096 compiler.loopopts.superword.TestLargeScaleAndStride + * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions -XX:+UnlockDiagnosticVMOptions -XX:SuperWordStoreToLoadForwardingFailureDetection=4096 compiler.loopopts.superword.TestLargeScaleAndStride */ package compiler.loopopts.superword; From 80dd0a1d0b939cfaf01cddd835cef0fa2af7ff7c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 28 Nov 2024 16:17:41 +0100 Subject: [PATCH 097/130] find MemorySegment address --- src/hotspot/share/classfile/vmSymbols.hpp | 2 + src/hotspot/share/opto/mempointer.cpp | 109 ++++++++++++++++++---- src/hotspot/share/opto/mempointer.hpp | 3 + 3 files changed, 94 insertions(+), 20 deletions(-) diff --git a/src/hotspot/share/classfile/vmSymbols.hpp b/src/hotspot/share/classfile/vmSymbols.hpp index 6a6f7754c509e..6b588fabdd91e 100644 --- a/src/hotspot/share/classfile/vmSymbols.hpp +++ b/src/hotspot/share/classfile/vmSymbols.hpp 
@@ -356,6 +356,8 @@ class SerializeClosure; template(jdk_internal_foreign_abi_VMStorage_array_array_signature, "[[Ljdk/internal/foreign/abi/VMStorage;") \ template(jdk_internal_foreign_abi_CallConv, "jdk/internal/foreign/abi/UpcallLinker$CallRegs") \ \ + template(jdk_internal_foreign_NativeMemorySegmentImpl, "jdk/internal/foreign/NativeMemorySegmentImpl") \ + \ /* Support for JVMCI */ \ JVMCI_VM_SYMBOLS_DO(template, do_alias) \ \ diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 85c4538e4b2dd..12c7dd369e064 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -26,6 +26,7 @@ #include "opto/mempointer.hpp" #include "opto/addnode.hpp" #include "utilities/resourceHash.hpp" +#include "classfile/vmSymbols.hpp" MemPointerParser::DecomposedNodeCallback MemPointerParser::DecomposedNodeCallback::_empty; @@ -168,28 +169,34 @@ void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, De callback.callback(n); return; } + case Op_CastX2P: + // A CastX2P indicates that we are pointing to native memory, where some long is cast to + // a pointer. In general, we have no guarantees about this long, and just take it as a + // terminal summand. A CastX2P can also be a good candidate for a native-memory "base". + if (!sub_expression_has_native_base_candidate(n->in(1))) { + // General case: take CastX2P as a terminal summand, it is a candidate for the "base". + break; + } + // Fall-through: we can find a more precise native-memory "base". We further decompose + // the CastX2P to find this "base" and any other offsets from it. case Op_CastII: case Op_CastLL: case Op_ConvI2L: - // On 32bit systems we can also look through ConvL2I, since the final result will always - // be truncated back with ConvL2I. 
On 64bit systems we cannot decompose ConvL2I because - // such int values will eventually be expanded to long with a ConvI2L: - // - // valL = max_jint + 1 - // ConvI2L(ConvL2I(valL)) = ConvI2L(min_jint) = min_jint != max_jint + 1 = valL - // - NOT_LP64( case Op_ConvL2I: ) - { - // Decompose: look through. - Node* a = n->in(1); - _worklist.push(MemPointerSummand(a, scale)); - callback.callback(n); - return; - } - case Op_CastX2P: - // In theory, we could parse through this, and further decompose. But this is also a good - // candidate for a native-memory "base". - break; + // On 32bit systems we can also look through ConvL2I, since the final result will always + // be truncated back with ConvL2I. On 64bit systems we cannot decompose ConvL2I because + // such int values will eventually be expanded to long with a ConvI2L: + // + // valL = max_jint + 1 + // ConvI2L(ConvL2I(valL)) = ConvI2L(min_jint) = min_jint != max_jint + 1 = valL + // + NOT_LP64( case Op_ConvL2I: ) + { + // Decompose: look through. + Node* a = n->in(1); + _worklist.push(MemPointerSummand(a, scale)); + callback.callback(n); + return; + } default: // All other operations cannot be further decomposed. We just add them to the // terminal summands below. @@ -201,6 +208,66 @@ void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, De _summands.push(summand); } +bool MemPointerParser::sub_expression_has_native_base_candidate(Node* start) { + // BFS over the expression. + ResourceMark rm; + GrowableArray worklist; + worklist.append(start); + for (int i = 0; i < worklist.length(); i++) { + Node* n = worklist.at(i); + n->dump(); + switch(n->Opcode()) { + case Op_AddL: + // Traverse to both inputs. + worklist.append(n->in(1)); + worklist.append(n->in(2)); + break; + case Op_SubL: + case Op_CastLL: + // Traverse to the first input. The base cannot be on the rhs of a sub. 
+ worklist.append(n->in(1)); + break; + default: + if (is_native_memory_base_candidate(n)) { return true; } + break; + } + // This is a heuristic, so we are allowed to bail out early if the graph + // is too deep. The constant is chosen arbitrarily, not too large but big + // enough for all normal cases. + if (worklist.length() > 100) { return false; } + } + // Parsed over the whole expression, nothing found. + assert(false, "TODO rm"); + return false; +} + +// Find any special long node that we think is a better native-memory "base" +// than a CastX2P. +// TODO direct buffer? +bool MemPointerParser::is_native_memory_base_candidate(Node* n) { + // TODO rename + if (n->Opcode() == Op_CastX2P) { return true; } + // LoadL from field jdk.internal.foreign.NativeMemorySegmentImpl.min + // It is used to hold the address() of a native MemorySegment. + if (n->Opcode() != Op_LoadL) { return false; } + LoadNode* load = n->as_Load(); + + const TypeInstPtr* inst_ptr = load->adr_type()->isa_instptr(); + if (inst_ptr == nullptr) { return false; } + + ciInstanceKlass* klass = inst_ptr->instance_klass(); + int offset = inst_ptr->offset(); + ciField* field = klass->get_field_by_offset(offset, false); + + Symbol* field_symbol = field->name()->get_symbol(); + Symbol* holder_symbol = field->holder()->name()->get_symbol(); + if (holder_symbol != vmSymbols::jdk_internal_foreign_NativeMemorySegmentImpl() || + field_symbol != vmSymbols::min_name()) { + return false; + } + return true; +} + // Check if the decomposition of operation opc is guaranteed to be safe. 
// Please refer to the definition of "safe decomposition" in mempointer.hpp bool MemPointerParser::is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const { @@ -344,7 +411,9 @@ Node* MemPointer::Base::find_base(Node* object_base, const GrowableArrayOpcode() == Op_CastX2P && s.scale().is_one()) { + if (object_base == nullptr && + s.scale().is_one() && + MemPointerParser::is_native_memory_base_candidate(s.variable())) { return s.variable(); } } diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 2939459b570f5..8f133b409add7 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -804,12 +804,15 @@ class MemPointerParser : public StackObj { return parser.mem_pointer(); } + static bool is_native_memory_base_candidate(Node* n); + private: const MemPointer& mem_pointer() const { return _mem_pointer; } MemPointer parse(DecomposedNodeCallback& callback); void parse_sub_expression(const MemPointerSummand& summand, DecomposedNodeCallback& callback); + static bool sub_expression_has_native_base_candidate(Node* n); bool is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const; }; From 7adcb6f5ac9c1259b0ad203f1e5eb7d76dfc64f4 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 28 Nov 2024 16:35:13 +0100 Subject: [PATCH 098/130] fix base casting --- src/hotspot/share/opto/mempointer.cpp | 1 - src/hotspot/share/opto/superword.cpp | 32 ++++++++++++++++++--------- 2 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 12c7dd369e064..0e2542ccffdc2 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -215,7 +215,6 @@ bool MemPointerParser::sub_expression_has_native_base_candidate(Node* start) { worklist.append(start); for (int i = 0; i < worklist.length(); i++) { Node* n = worklist.at(i); - n->dump(); switch(n->Opcode()) { 
case Op_AddL: // Traverse to both inputs. diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 62433faf05b28..d47e5249e7c7a 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2927,17 +2927,27 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // 1.3: base (unless base is guaranteed aw aligned) if (aw > ObjectAlignmentInBytes || is_base_native) { - // The base is only aligned with ObjectAlignmentInBytes with arrays. - // When the base() is top, we have no alignment guarantee at all. - // Hence, we must now take the base into account for the calculation. - Node* xbase = new CastP2XNode(nullptr, base); - phase()->register_new_node(xbase, pre_ctrl); - TRACE_ALIGN_VECTOR_NODE(xbase); -#ifdef _LP64 - xbase = new ConvL2INode(xbase); - phase()->register_new_node(xbase, pre_ctrl); - TRACE_ALIGN_VECTOR_NODE(xbase); -#endif + // For objects, the base is ObjectAlignmentInBytes aligned. + // For native memory, we have no such guarantee, and must + // always take the base into account for the calculation. + // + // Computations are done % (vector width/element size) so it's + // safe to simply convert invar to an int and lose the upper 32 + // bit half. The base could be ptr, long or int. We cast all + // to int. 
+ Node* xbase = base; + if (igvn().type(xbase)->isa_ptr()) { + // ptr -> int/long + xbase = new CastP2XNode(nullptr, xbase); + phase()->register_new_node(xbase, pre_ctrl); + TRACE_ALIGN_VECTOR_NODE(xbase); + } + if (igvn().type(xbase)->isa_long()) { + // long -> int + xbase = new ConvL2INode(xbase); + phase()->register_new_node(xbase, pre_ctrl); + TRACE_ALIGN_VECTOR_NODE(xbase); + } if (is_sub) { xbic = new SubINode(xbic, xbase); } else { From dbdab4923a70c8a0e39247a6af907d4a6b30e090 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 28 Nov 2024 16:39:11 +0100 Subject: [PATCH 099/130] rm assert and add comments --- src/hotspot/share/opto/mempointer.cpp | 1 - src/hotspot/share/opto/superword.cpp | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 0e2542ccffdc2..60cb535c8270e 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -236,7 +236,6 @@ bool MemPointerParser::sub_expression_has_native_base_candidate(Node* start) { if (worklist.length() > 100) { return false; } } // Parsed over the whole expression, nothing found. - assert(false, "TODO rm"); return false; } diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index d47e5249e7c7a..e45a9547a6c96 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2928,8 +2928,10 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { // 1.3: base (unless base is guaranteed aw aligned) if (aw > ObjectAlignmentInBytes || is_base_native) { // For objects, the base is ObjectAlignmentInBytes aligned. - // For native memory, we have no such guarantee, and must - // always take the base into account for the calculation. + // For native memory, we simply have a long that was cast to + // a pointer via CastX2P, or if we parsed through the CastX2P + // we only have a long. 
There is no alignment guarantee, and + // we must always take the base into account for the calculation. // // Computations are done % (vector width/element size) so it's // safe to simply convert invar to an int and loose the upper 32 From 046a949a9efb75f82e4c2766b9a87203a1b64e0d Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 28 Nov 2024 17:08:57 +0100 Subject: [PATCH 100/130] fix up test --- .../loopopts/superword/TestMemorySegment.java | 22 ++++++++++--------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java index 9eb97709224b3..4040527a58ecb 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java @@ -645,13 +645,14 @@ static Object[] testLongLoop_longIndex_longInvar_sameAdr_byte(MemorySegment a, l } @Test - @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", - IRNode.ADD_VB, "= 0", - IRNode.STORE_VECTOR, "= 0"}, - applyIfPlatform = {"64-bit", "true"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", + // IRNode.ADD_VB, "= 0", + // IRNode.STORE_VECTOR, "= 0"}, + // applyIfPlatform = {"64-bit", "true"}, + // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // FAILS: invariants are sorted differently, because of differently inserted Cast. // See: JDK-8330274 + // Interestingly, it now passes for native, but not for objects. 
static Object[] testLongLoop_longIndex_intInvar_byte(MemorySegment a, int invar) { for (long i = 0; i < a.byteSize(); i++) { long adr1 = (long)(i) + (long)(invar); @@ -663,13 +664,14 @@ static Object[] testLongLoop_longIndex_intInvar_byte(MemorySegment a, int invar) } @Test - @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", - IRNode.ADD_VB, "= 0", - IRNode.STORE_VECTOR, "= 0"}, - applyIfPlatform = {"64-bit", "true"}, - applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", + // IRNode.ADD_VB, "= 0", + // IRNode.STORE_VECTOR, "= 0"}, + // applyIfPlatform = {"64-bit", "true"}, + // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // FAILS: invariants are sorted differently, because of differently inserted Cast. // See: JDK-8330274 + // Interestingly, it now passes for native, but not for objects. static Object[] testLongLoop_longIndex_longInvar_byte(MemorySegment a, long invar) { for (long i = 0; i < a.byteSize(); i++) { long adr1 = (long)(i) + (long)(invar); From dafce8841f6c2999c5cb7b393bad9e4728ad7bed Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 28 Nov 2024 17:14:24 +0100 Subject: [PATCH 101/130] update comments --- src/hotspot/share/opto/mempointer.cpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 60cb535c8270e..ee278b279a8d0 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -239,14 +239,15 @@ bool MemPointerParser::sub_expression_has_native_base_candidate(Node* start) { return false; } -// Find any special long node that we think is a better native-memory "base" -// than a CastX2P. -// TODO direct buffer? +// Check if the node is a candidate to be a memory segment "base". +// (1) CastX2P +// (2) LoadL from field jdk.internal.foreign.NativeMemorySegmentImpl.min +// Holds the address() of a native memory segment. 
bool MemPointerParser::is_native_memory_base_candidate(Node* n) { - // TODO rename + // (1) CastX2P if (n->Opcode() == Op_CastX2P) { return true; } - // LoadL from field jdk.internal.foreign.NativeMemorySegmentImpl.min - // It is used to hold the address() of a native MemorySegment. + + // (2) LoadL from field jdk.internal.foreign.NativeMemorySegmentImpl.min if (n->Opcode() != Op_LoadL) { return false; } LoadNode* load = n->as_Load(); From 539b50dcf39e6d2348159d189cca0f13102935e9 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 29 Nov 2024 08:04:49 +0100 Subject: [PATCH 102/130] check if field not found --- src/hotspot/share/opto/mempointer.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index ee278b279a8d0..4311b2f5ee2db 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -257,6 +257,7 @@ bool MemPointerParser::is_native_memory_base_candidate(Node* n) { ciInstanceKlass* klass = inst_ptr->instance_klass(); int offset = inst_ptr->offset(); ciField* field = klass->get_field_by_offset(offset, false); + if (field == nullptr) { return false; } Symbol* field_symbol = field->name()->get_symbol(); Symbol* holder_symbol = field->holder()->name()->get_symbol(); From db4b0d104bbaf63bd422217293aff6b4149cf2e5 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 29 Nov 2024 08:57:44 +0100 Subject: [PATCH 103/130] some comment and naming improvements --- src/hotspot/share/opto/mempointer.cpp | 8 +++---- src/hotspot/share/opto/mempointer.hpp | 33 ++++++++++++++++++++++++++- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 4311b2f5ee2db..25e683fe2ed1a 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -240,7 +240,7 @@ bool MemPointerParser::sub_expression_has_native_base_candidate(Node* start) { } // 
Check if the node is a candidate to be a memory segment "base". -// (1) CastX2P +// (1) CastX2P: some arbitrary long that is cast to a pointer. // (2) LoadL from field jdk.internal.foreign.NativeMemorySegmentImpl.min // Holds the address() of a native memory segment. bool MemPointerParser::is_native_memory_base_candidate(Node* n) { @@ -412,7 +412,7 @@ Node* MemPointer::Base::find_base(Node* object_base, const GrowableArray why want a good base that is probably aligned, +// and it must be same for different MemPointer if possible -> challenging +// (1) CastX2P +// This is simply some arbitrary long cast to a pointer. It may be computed as an addition of +// multiple long and even int values. In some cases this means that we could have further +// decomposed the CastX2P further, but at that point it is even harder to tell what should be +// a good candidate for a native memory base. TODO +// (2) LoadL from field jdk.internal.foreign.NativeMemorySegmentImpl.min +// This is especially interesting because it holds the address() of a native MemorySegment. 
+// // ----------------------------------------------------------------------------------------- // // We have to be careful on 64-bit systems with ConvI2L: decomposing its input is not @@ -675,7 +706,7 @@ class MemPointer : public StackObj { bool has_same_summands_as(const MemPointer& other, uint start) const; bool has_same_summands_as(const MemPointer& other) const { return has_same_summands_as(other, 0); } - bool has_different_base_but_otherwise_same_summands_as(const MemPointer& other) const; + bool has_different_object_base_but_otherwise_same_summands_as(const MemPointer& other) const; public: bool has_same_non_base_summands_as(const MemPointer& other) const { From 133b1460811bf1fa252f5293d427845cc348ec04 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sun, 1 Dec 2024 14:31:42 +0100 Subject: [PATCH 104/130] make sort stable --- src/hotspot/share/opto/superword.cpp | 12 +++++++++--- src/hotspot/share/opto/superword.hpp | 14 +++++++++++--- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index e45a9547a6c96..4e85e05bf8487 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -510,7 +510,7 @@ int SuperWord::MemOp::cmp_by_group(MemOp* a, MemOp* b) { b->vpointer().mem_pointer()); } -int SuperWord::MemOp::cmp_by_group_and_con(MemOp* a, MemOp* b) { +int SuperWord::MemOp::cmp_by_group_and_con_and_original_index(MemOp* a, MemOp* b) { // Group int cmp_group = cmp_by_group(a, b); if (cmp_group != 0) { return cmp_group; } @@ -520,6 +520,8 @@ int SuperWord::MemOp::cmp_by_group_and_con(MemOp* a, MemOp* b) { jint b_con = b->vpointer().mem_pointer().con().value(); RETURN_CMP_VALUE_IF_NOT_EQUAL(a_con, b_con); + RETURN_CMP_VALUE_IF_NOT_EQUAL(a->original_index(), b->original_index()); + return 0; } @@ -537,7 +539,10 @@ void SuperWord::create_adjacent_memop_pairs() { // This decreases the work. 
// - VPointer con: Sorting by VPointer con inside the group allows us to perform a sliding // window algorithm, to determine adjacent memops efficiently. - memops.sort(MemOp::cmp_by_group_and_con); + // Since GrowableArray::sort relies on qsort, the sort is not stable on its own. This can lead + // to worse packing in some cases. To make the sort stable, our last cmp criterion is the + // original index, i.e. the position in the memops array before sorting. + memops.sort(MemOp::cmp_by_group_and_con_and_original_index); #ifndef PRODUCT if (is_trace_superword_adjacent_memops()) { @@ -557,12 +562,13 @@ void SuperWord::create_adjacent_memop_pairs() { // Collect all memops that could potentially be vectorized. void SuperWord::collect_valid_memops(GrowableArray& memops) { + int original_index = 0; for_each_mem([&] (MemNode* mem, int bb_idx) { const VPointer& p = vpointer(mem); if (p.is_valid() && !mem->is_LoadStore() && is_java_primitive(mem->memory_type())) { - memops.append(MemOp(mem, &p)); + memops.append(MemOp(mem, &p, original_index++)); } }); } diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index c51e9cb418edb..3d10a01ecc623 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -565,17 +565,25 @@ class SuperWord : public ResourceObj { private: MemNode* _mem; const VPointer* _vpointer; + int _original_index; public: // Empty, for GrowableArray - MemOp() : _mem(nullptr), _vpointer(nullptr) {} - MemOp(MemNode* mem, const VPointer* vpointer) : _mem(mem), _vpointer(vpointer) {} + MemOp() : + _mem(nullptr), + _vpointer(nullptr), + _original_index(-1) {} + MemOp(MemNode* mem, const VPointer* vpointer, int original_index) : + _mem(mem), + _vpointer(vpointer), + _original_index(original_index) {} MemNode* mem() const { return _mem; } const VPointer& vpointer() const { return *_vpointer; } + int original_index() const { return _original_index; } static int cmp_by_group(MemOp* a, MemOp* b); - 
static int cmp_by_group_and_con(MemOp* a, MemOp* b); + static int cmp_by_group_and_con_and_original_index(MemOp* a, MemOp* b); }; void create_adjacent_memop_pairs(); void collect_valid_memops(GrowableArray& memops); From 71eefe9fb5bc63161b6d8ddcce7b928340ccd0da Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sun, 1 Dec 2024 15:39:21 +0100 Subject: [PATCH 105/130] hide parser via delegation --- src/hotspot/share/opto/memnode.cpp | 4 +-- src/hotspot/share/opto/mempointer.cpp | 13 +++++-- src/hotspot/share/opto/mempointer.hpp | 45 +++++++++++++----------- src/hotspot/share/opto/superword.cpp | 2 +- src/hotspot/share/opto/vectorization.hpp | 10 +++--- 5 files changed, 42 insertions(+), 32 deletions(-) diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index ca64881b25c7e..f9f1135b33ad3 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -2947,8 +2947,8 @@ bool MergePrimitiveStores::is_adjacent_pair(const StoreNode* use_store, const St is_trace_pointer_adjacency(), true); #endif - const MemPointer pointer_use(MemPointerParser::parse(NOT_PRODUCT(trace COMMA) use_store)); - const MemPointer pointer_def(MemPointerParser::parse(NOT_PRODUCT(trace COMMA) def_store)); + const MemPointer pointer_use(NOT_PRODUCT(trace COMMA) use_store); + const MemPointer pointer_def(NOT_PRODUCT(trace COMMA) def_store); return pointer_def.is_adjacent_to_and_before(pointer_use); } diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 25e683fe2ed1a..b1aa584dfe618 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -28,11 +28,18 @@ #include "utilities/resourceHash.hpp" #include "classfile/vmSymbols.hpp" -MemPointerParser::DecomposedNodeCallback MemPointerParser::DecomposedNodeCallback::_empty; +MemPointerParserCallback MemPointerParserCallback::_empty; + +MemPointer::MemPointer(NOT_PRODUCT(const TraceMemPointer& trace COMMA) + const 
MemNode* mem, + MemPointerParserCallback& callback) : + MemPointer(MemPointerParser::parse(NOT_PRODUCT(trace COMMA) + mem, + callback)) {} // Recursively parse the pointer expression with a DFS all-path traversal // (i.e. with node repetitions), starting at the pointer. -MemPointer MemPointerParser::parse(DecomposedNodeCallback& callback) { +MemPointer MemPointerParser::parse(MemPointerParserCallback& callback) { assert(_worklist.is_empty(), "no prior parsing"); assert(_summands.is_empty(), "no prior parsing"); @@ -91,7 +98,7 @@ MemPointer MemPointerParser::parse(DecomposedNodeCallback& callback) { // Parse a sub-expression of the pointer, starting at the current summand. We parse the // current node, and see if it can be decomposed into further summands, or if the current // summand is terminal. -void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, DecomposedNodeCallback& callback) { +void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, MemPointerParserCallback& callback) { Node* n = summand.variable(); const NoOverflowInt scale = summand.scale(); const NoOverflowInt one(1); diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 86c3dda86bff1..39e9e9b8d51c1 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -538,6 +538,22 @@ class MemPointerSummand : public StackObj { #endif }; +// Parsing calls the callback on every decomposed node. These are all the +// nodes on the paths from the pointer to the summand variables, i.e. the +// "inner" nodes of the pointer expression. This callback is for example +// used in SuperWord::unrolling_analysis to collect all inner nodes of a +// pointer expression. +class MemPointerParserCallback : public StackObj { +private: + static MemPointerParserCallback _empty; + +public: + virtual void callback(Node* n) { /* do nothing by default */ } + + // Singleton for default arguments. 
+ static MemPointerParserCallback& empty() { return _empty; } +}; + // Decomposed form of the pointer sub-expression of "pointer". // // pointer = SUM(summands) + con @@ -674,6 +690,11 @@ class MemPointer : public StackObj { } public: + // Parse pointer of MemNode. Delegates to MemPointerParser::parse. + MemPointer(NOT_PRODUCT(const TraceMemPointer& trace COMMA) + const MemNode* mem, + MemPointerParserCallback& callback = MemPointerParserCallback::empty()); + static MemPointer make_trivial(Node* pointer, const jint size NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { @@ -778,22 +799,6 @@ class MemPointer : public StackObj { }; class MemPointerParser : public StackObj { -public: - // Parsing calls the callback on every decomposed node. These are all the - // nodes on the paths from the pointer to the summand variables, i.e. the - // "inner" nodes of the pointer expression. This callback allows collecting - // all such nodes of a pointer expression. - class DecomposedNodeCallback : public StackObj { - private: - static DecomposedNodeCallback _empty; - - public: - virtual void callback(Node* n) { /* do nothing by default */ } - - // Singleton for default arguments. 
- static DecomposedNodeCallback& empty() { return _empty; } - }; - private: NOT_PRODUCT( const TraceMemPointer& _trace; ) @@ -809,7 +814,7 @@ class MemPointerParser : public StackObj { public: MemPointerParser(const MemNode* mem, - DecomposedNodeCallback& callback + MemPointerParserCallback& callback NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), @@ -818,7 +823,7 @@ class MemPointerParser : public StackObj { static MemPointer parse(NOT_PRODUCT(const TraceMemPointer& trace COMMA) const MemNode* mem, - DecomposedNodeCallback& callback = DecomposedNodeCallback::empty()) { + MemPointerParserCallback& callback = MemPointerParserCallback::empty()) { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are allowed"); ResourceMark rm; MemPointerParser parser(mem, callback NOT_PRODUCT(COMMA trace)); @@ -840,9 +845,9 @@ class MemPointerParser : public StackObj { private: const MemPointer& mem_pointer() const { return _mem_pointer; } - MemPointer parse(DecomposedNodeCallback& callback); + MemPointer parse(MemPointerParserCallback& callback); - void parse_sub_expression(const MemPointerSummand& summand, DecomposedNodeCallback& callback); + void parse_sub_expression(const MemPointerSummand& summand, MemPointerParserCallback& callback); static bool sub_expression_has_native_base_candidate(Node* n); bool is_safe_to_decompose_op(const int opc, const NoOverflowInt& scale) const; diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 4e85e05bf8487..c6b50d690616f 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -50,7 +50,7 @@ SuperWord::SuperWord(const VLoopAnalyzer &vloop_analyzer) : } // Collect ignored loop nodes during VPointer parsing. 
-class SuperWordUnrollingAnalysisIgnoredNodes : public MemPointerParser::DecomposedNodeCallback { +class SuperWordUnrollingAnalysisIgnoredNodes : public MemPointerParserCallback { private: const VLoop& _vloop; const Node_List& _body; diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 8acacb4d385fc..8010b12e2ddb3 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -723,8 +723,6 @@ class VLoopAnalyzer : StackObj { // class VPointer : public ArenaObj { private: - typedef MemPointerParser::DecomposedNodeCallback DecomposedNodeCallback; - const VLoop& _vloop; const MemPointer _mem_pointer; @@ -744,11 +742,11 @@ class VPointer : public ArenaObj { public: VPointer(const MemNode* mem, const VLoop& vloop, - DecomposedNodeCallback& callback = DecomposedNodeCallback::empty()) : + MemPointerParserCallback& callback = MemPointerParserCallback::empty()) : VPointer(vloop, - MemPointerParser::parse(NOT_PRODUCT(vloop.mptrace() COMMA) - mem, - callback)) + MemPointer(NOT_PRODUCT(vloop.mptrace() COMMA) + mem, + callback)) { #ifndef PRODUCT if (vloop.mptrace().is_trace_parsing()) { From d6bf696789a66c29f9f95499feb9da4deb18b5ac Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sun, 1 Dec 2024 16:01:21 +0100 Subject: [PATCH 106/130] improve documentation --- src/hotspot/share/opto/mempointer.hpp | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 39e9e9b8d51c1..822c5fdf3ab0c 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -225,19 +225,23 @@ // alias. // // - Native (off-heap) base (MemPointer::base().is_native()): -// It is a pointer into off-heap memory. We do not know if it points at the beginning or into the -// middle of some off-heap allocated memory. We have no guarantees about the alignment either. 
All -// we require, is that it is a summand with a scale = 1, and that it is accepted as a -// MemPointer::is_native_memory_base_candidate. It can thus be one of these: -// TODO continue talking about alignment use case -> why want a good base that is probably aligned, -// and it must be same for different MemPointer if possible -> challenging +// When we decompose a pointer to native memory, it is at first not clear that there is a base address. +// Even if we could know that there is some base address to which we add index offsets, we cannot know +// if this reference address points to the beginning of a native memory allocation or into the middle, +// or outside it. We also have no guarantee for alignment with such a base address. +// Still: we would like to find such a base if possible, and if two pointers are similar (i.e. have the +// same summands), we would like to find the same base. Further, it is reasonable to speculatively +// assume that such base addresses are aligned (need to add this speculative check in JDK-8323582). +// A base pointer must have scale = 1, and be accepted by MemPointer::is_native_memory_base_candidate. +// It can thus be one of these: // (1) CastX2P // This is simply some arbitrary long cast to a pointer. It may be computed as an addition of // multiple long and even int values. In some cases this means that we could have further -// decomposed the CastX2P further, but at that point it is even harder to tell what should be -// a good candidate for a native memory base. TODO +// decomposed the CastX2P, but at that point it is even harder to tell what should be a good +// candidate for a native memory base. // (2) LoadL from field jdk.internal.foreign.NativeMemorySegmentImpl.min -// This is especially interesting because it holds the address() of a native MemorySegment. +// This would be preferable over CastX2P, because it holds the address() of a native +// MemorySegment, i.e. 
we know it points to the beginning of that MemorySegment. // // ----------------------------------------------------------------------------------------- // From b5de584c89e5cd6b13dc00ef2e96ff8f5ea38d37 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sun, 1 Dec 2024 16:32:57 +0100 Subject: [PATCH 107/130] add TestEquivalentInvariants.java --- .../superword/TestEquivalentInvariants.java | 951 ++++++++++++++++++ 1 file changed, 951 insertions(+) create mode 100644 test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java new file mode 100644 index 0000000000000..074071e96d218 --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java @@ -0,0 +1,951 @@ +/* + * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +package compiler.loopopts.superword; + +import compiler.lib.ir_framework.*; +import jdk.test.lib.Utils; +import jdk.internal.misc.Unsafe; +import java.lang.reflect.Array; +import java.util.Map; +import java.util.HashMap; +import java.util.Random; +import java.lang.foreign.*; + +/* + * @test + * @bug 8343685 8330274 + * @summary Test vectorization with various invariants that are equivalent, but not trivially so, + * i.e. where the invariants have the same summands, but in a different order. + * @modules java.base/jdk.internal.misc + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestEquivalentInvariants + */ + +public class TestEquivalentInvariants { + static int RANGE = 1024*64; + private static final Unsafe UNSAFE = Unsafe.getUnsafe(); + private static final Random RANDOM = Utils.getRandomInstance(); + + // Inputs + byte[] aB; + byte[] bB; + int[] aI; + int[] bI; + long[] aL; + long[] bL; + + // List of tests + Map tests = new HashMap(); + + // List of gold, the results from the first run before compilation + Map golds = new HashMap(); + + interface TestFunction { + Object[] run(); + } + + public static void main(String[] args) { + TestFramework.runWithFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED", + "-XX:-AlignVector"); + TestFramework.runWithFlags("--add-modules", "java.base", "--add-exports", "java.base/jdk.internal.misc=ALL-UNNAMED", + "-XX:+AlignVector"); + } + + public TestEquivalentInvariants() { + // Generate input once + aB = generateB(); + bB = generateB(); + aI = generateI(); + bI = generateI(); + aL = generateL(); + bL = generateL(); + + // Add all tests to list + tests.put("testArrayBB", () -> { + return testArrayBB(aB.clone(), bB.clone()); + }); + tests.put("testArrayBBInvar3", () -> { + return testArrayBBInvar3(aB.clone(), bB.clone(), 0, 0, 0); + }); + tests.put("testMemorySegmentB", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return 
testMemorySegmentB(data); + }); + tests.put("testMemorySegmentBInvarI", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarI(data, 101, RANGE-200); + }); + tests.put("testMemorySegmentBInvarL", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarL(data, 101, RANGE-200); + }); + tests.put("testMemorySegmentBInvarIAdr", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarIAdr(data, 101, RANGE-200); + }); + tests.put("testMemorySegmentBInvarLAdr", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarLAdr(data, 101, RANGE-200); + }); + tests.put("testMemorySegmentBInvarI3a", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarI3a(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentBInvarI3b", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarI3b(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentBInvarI3c", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarI3c(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentBInvarI3d", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarI3d(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentBInvarI3e", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarI3e(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentBInvarI3f", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarI3f(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentBInvarL3g", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarL3g(data, 1, 2, 3, RANGE-200); + }); + 
tests.put("testMemorySegmentBInvarL3h", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarL3h(data, -1, -2, -3, RANGE-200); + }); + tests.put("testMemorySegmentBInvarL3k", () -> { + MemorySegment data = MemorySegment.ofArray(aB.clone()); + return testMemorySegmentBInvarL3k(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentIInvarL3a", () -> { + MemorySegment data = MemorySegment.ofArray(aI.clone()); + return testMemorySegmentIInvarL3a(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentIInvarL3b", () -> { + MemorySegment data = MemorySegment.ofArray(aI.clone()); + return testMemorySegmentIInvarL3b(data, -1, -2, -3, RANGE-200); + }); + tests.put("testMemorySegmentIInvarL3c", () -> { + MemorySegment data = MemorySegment.ofArray(aI.clone()); + return testMemorySegmentIInvarL3c(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentIInvarL3d", () -> { + MemorySegment data = MemorySegment.ofArray(aI.clone()); + return testMemorySegmentIInvarL3d(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentIInvarL3d2", () -> { + MemorySegment data = MemorySegment.ofArray(aI.clone()); + return testMemorySegmentIInvarL3d2(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentIInvarL3d3", () -> { + MemorySegment data = MemorySegment.ofArray(aI.clone()); + return testMemorySegmentIInvarL3d3(data, RANGE-200); + }); + tests.put("testMemorySegmentIInvarL3e", () -> { + MemorySegment data = MemorySegment.ofArray(aI.clone()); + return testMemorySegmentIInvarL3e(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentIInvarL3f", () -> { + MemorySegment data = MemorySegment.ofArray(aI.clone()); + return testMemorySegmentIInvarL3f(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentLInvarL3a", () -> { + MemorySegment data = MemorySegment.ofArray(aL.clone()); + return testMemorySegmentLInvarL3a(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentLInvarL3b", () -> { + 
MemorySegment data = MemorySegment.ofArray(aL.clone()); + return testMemorySegmentLInvarL3b(data, -1, -2, -3, RANGE-200); + }); + tests.put("testMemorySegmentLInvarL3c", () -> { + MemorySegment data = MemorySegment.ofArray(aL.clone()); + return testMemorySegmentLInvarL3c(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentLInvarL3d", () -> { + MemorySegment data = MemorySegment.ofArray(aL.clone()); + return testMemorySegmentLInvarL3d(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentLInvarL3d2", () -> { + MemorySegment data = MemorySegment.ofArray(aL.clone()); + return testMemorySegmentLInvarL3d2(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentLInvarL3d3", () -> { + MemorySegment data = MemorySegment.ofArray(aL.clone()); + return testMemorySegmentLInvarL3d3(data, RANGE-200); + }); + tests.put("testMemorySegmentLInvarL3e", () -> { + MemorySegment data = MemorySegment.ofArray(aL.clone()); + return testMemorySegmentLInvarL3e(data, 1, 2, 3, RANGE-200); + }); + tests.put("testMemorySegmentLInvarL3f", () -> { + MemorySegment data = MemorySegment.ofArray(aL.clone()); + return testMemorySegmentLInvarL3f(data, 1, 2, 3, RANGE-200); + }); + tests.put("testLargeInvariantSum", () -> { + return testLargeInvariantSum(aB.clone(), 0, 0, 0, RANGE-200); + }); + + // Compute gold value for all test methods before compilation + for (Map.Entry entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + Object[] gold = test.run(); + golds.put(name, gold); + } + } + + @Warmup(100) + @Run(test = {"testArrayBB", + "testArrayBBInvar3", + "testMemorySegmentB", + "testMemorySegmentBInvarI", + "testMemorySegmentBInvarL", + "testMemorySegmentBInvarIAdr", + "testMemorySegmentBInvarLAdr", + "testMemorySegmentBInvarI3a", + "testMemorySegmentBInvarI3b", + "testMemorySegmentBInvarI3c", + "testMemorySegmentBInvarI3d", + "testMemorySegmentBInvarI3e", + "testMemorySegmentBInvarI3f", + "testMemorySegmentBInvarL3g", + 
"testMemorySegmentBInvarL3h", + "testMemorySegmentBInvarL3k", + "testMemorySegmentIInvarL3a", + "testMemorySegmentIInvarL3b", + "testMemorySegmentIInvarL3c", + "testMemorySegmentIInvarL3d", + "testMemorySegmentIInvarL3d2", + "testMemorySegmentIInvarL3d3", + "testMemorySegmentIInvarL3e", + "testMemorySegmentIInvarL3f", + "testMemorySegmentLInvarL3a", + "testMemorySegmentLInvarL3b", + "testMemorySegmentLInvarL3c", + "testMemorySegmentLInvarL3d", + "testMemorySegmentLInvarL3d2", + "testMemorySegmentLInvarL3d3", + "testMemorySegmentLInvarL3e", + "testMemorySegmentLInvarL3f", + "testLargeInvariantSum"}) + public void runTests() { + for (Map.Entry entry : tests.entrySet()) { + String name = entry.getKey(); + TestFunction test = entry.getValue(); + // Recall gold value from before compilation + Object[] gold = golds.get(name); + // Compute new result + Object[] result = test.run(); + // Compare gold and new result + verify(name, gold, result); + } + } + + static byte[] generateB() { + byte[] a = new byte[RANGE]; + for (int i = 0; i < a.length; i++) { + a[i] = (byte)RANDOM.nextInt(); + } + return a; + } + + static short[] generateS() { + short[] a = new short[RANGE]; + for (int i = 0; i < a.length; i++) { + a[i] = (short)RANDOM.nextInt(); + } + return a; + } + + static int[] generateI() { + int[] a = new int[RANGE]; + for (int i = 0; i < a.length; i++) { + a[i] = RANDOM.nextInt(); + } + return a; + } + + static long[] generateL() { + long[] a = new long[RANGE]; + for (int i = 0; i < a.length; i++) { + a[i] = RANDOM.nextLong(); + } + return a; + } + + static void verify(String name, Object[] gold, Object[] result) { + if (gold.length != result.length) { + throw new RuntimeException("verify " + name + ": not the same number of outputs: gold.length = " + + gold.length + ", result.length = " + result.length); + } + for (int i = 0; i < gold.length; i++) { + Object g = gold[i]; + Object r = result[i]; + if (g == r) { + throw new RuntimeException("verify " + name + ": should be 
two separate objects (with identical content):" + + " gold[" + i + "] == result[" + i + "]"); + } + + // Wrap everything in MemorySegments, this allows simple value verification of Array as well as MemorySegment. + MemorySegment mg = null; + MemorySegment mr = null; + if (g.getClass().isArray()) { + if (g.getClass() != r.getClass() || !g.getClass().isArray() || !r.getClass().isArray()) { + throw new RuntimeException("verify " + name + ": must both be array of same type:" + + " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() + + " result[" + i + "].getClass() = " + r.getClass().getSimpleName()); + } + if (Array.getLength(g) != Array.getLength(r)) { + throw new RuntimeException("verify " + name + ": arrays must have same length:" + + " gold[" + i + "].length = " + Array.getLength(g) + + " result[" + i + "].length = " + Array.getLength(r)); + } + Class c = g.getClass().getComponentType(); + if (c == byte.class) { + mg = MemorySegment.ofArray((byte[])g); + mr = MemorySegment.ofArray((byte[])r); + } else if (c == int.class) { + mg = MemorySegment.ofArray((int[])g); + mr = MemorySegment.ofArray((int[])r); + } else if (c == long.class) { + mg = MemorySegment.ofArray((long[])g); + mr = MemorySegment.ofArray((long[])r); + } else { + throw new RuntimeException("verify " + name + ": array type not supported for verify:" + + " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() + + " result[" + i + "].getClass() = " + r.getClass().getSimpleName()); + } + } else if (g instanceof MemorySegment) { + mg = (MemorySegment)g; + if (!(r instanceof MemorySegment)) { + throw new RuntimeException("verify " + name + ": was not both MemorySegment:" + + " gold[" + i + "].getClass() = " + g.getClass().getSimpleName() + + " result[" + i + "].getClass() = " + r.getClass().getSimpleName()); + } + mr = (MemorySegment)r; + } + + if (mg.byteSize() != mr.byteSize()) { + throw new RuntimeException("verify " + name + ": memory segment must have same length:" + + " gold[" + i + 
"].length = " + mg.byteSize() + + " result[" + i + "].length = " + mr.byteSize()); + } + verifyMS(name, i, mg, mr); + } + } + + static void verifyMS(String name, int i, MemorySegment g, MemorySegment r) { + for (long j = 0; j < g.byteSize(); j++) { + byte vg = g.get(ValueLayout.JAVA_BYTE, j); + byte vr = r.get(ValueLayout.JAVA_BYTE, j); + if (vg != vr) { + throw new RuntimeException("verify " + name + ": arrays must have same content:" + + " gold[" + i + "][" + j + "] = " + vg + + " result[" + i + "][" + j + "] = " + vr); + } + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testArrayBB(byte[] a, byte[] b) { + for (int i = 0; i < a.length; i++) { + b[i+0] = (byte)(a[i] + 1); + } + return new Object[]{ a, b }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Same int invariant summands, but added in a different order. + static Object[] testArrayBBInvar3(byte[] a, byte[] b, int invar1, int invar2, int invar3) { + int i1 = invar1 + invar2 + invar3; + int i2 = invar2 + invar3 + invar1; + for (int i = 0; i < a.length; i++) { + b[i + i1] = (byte)(a[i + i2] + 1); + } + return new Object[]{ a, b }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Just a simple pattern, without any (explicit) invariant. 
+ static Object[] testMemorySegmentB(MemorySegment m) { + for (int i = 0; i < (int)m.byteSize(); i++) { + byte v = m.get(ValueLayout.JAVA_BYTE, i); + m.set(ValueLayout.JAVA_BYTE, i, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Does not vectorize: RangeChecks are not eliminated. + // Filed RFE: JDK-8327209 + static Object[] testMemorySegmentBInvarI(MemorySegment m, int invar, int size) { + for (int i = 0; i < size; i++) { + byte v = m.get(ValueLayout.JAVA_BYTE, i + invar); + m.set(ValueLayout.JAVA_BYTE, i + invar, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Has different invariants, before sorting: + // + // 3125 AddL = ((CastLL(Param 11) + ConvI2L(1460 Phi)) + 530 LoadL) + // 3127 AddL = (ConvI2L(1460 Phi) + (11 Param + 530 LoadL)) + // + static Object[] testMemorySegmentBInvarL(MemorySegment m, long invar, int size) { + for (int i = 0; i < size; i++) { + byte v = m.get(ValueLayout.JAVA_BYTE, i + invar); + m.set(ValueLayout.JAVA_BYTE, i + invar, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Does not vectorize: RangeChecks are not eliminated. 
+ // Filed RFE: JDK-8327209 + static Object[] testMemorySegmentBInvarIAdr(MemorySegment m, int invar, int size) { + for (int i = 0; i < size; i++) { + long adr = i + invar; + byte v = m.get(ValueLayout.JAVA_BYTE, adr); + m.set(ValueLayout.JAVA_BYTE, adr, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Since we add "i + invar", the invariant is already equivalent without sorting. + static Object[] testMemorySegmentBInvarLAdr(MemorySegment m, long invar, int size) { + for (int i = 0; i < size; i++) { + long adr = i + invar; + byte v = m.get(ValueLayout.JAVA_BYTE, adr); + m.set(ValueLayout.JAVA_BYTE, adr, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentBInvarI3a(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(invar1 + invar2 + invar3); + long i2 = (long)(invar2 + invar3 + invar1); // equivalent + for (int i = 0; i < size; i++) { + byte v = m.get(ValueLayout.JAVA_BYTE, i + i1); + m.set(ValueLayout.JAVA_BYTE, i + i2, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentBInvarI3b(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(invar1 + invar2 + invar3); + long i2 = (long)(invar2 + invar3 + invar1); // equivalent + for (int i = 0; i < size; i+=2) { + byte v0 = 
m.get(ValueLayout.JAVA_BYTE, i + i1 + 0); + byte v1 = m.get(ValueLayout.JAVA_BYTE, i + i2 + 1); + m.set(ValueLayout.JAVA_BYTE, i + i1 + 0, (byte)(v0 + 1)); + m.set(ValueLayout.JAVA_BYTE, i + i2 + 1, (byte)(v1 + 1)); + } + return new Object[]{ m }; + } + + @Test + // Currently, we don't vectorize. But we may vectorize this, once we implement something like aliasing analysis, + // though in this particular case we know that the values at runtime will alias. + static Object[] testMemorySegmentBInvarI3c(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(invar1 + invar2 + invar3); + long i2 = (long)(invar2 + invar3) + (long)(invar1); // not equivalent! + for (int i = 0; i < size; i++) { + byte v = m.get(ValueLayout.JAVA_BYTE, i + i1); + m.set(ValueLayout.JAVA_BYTE, i + i2, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentBInvarI3d(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(invar1 + invar2 + invar3); + long i2 = (long)(invar2 + invar3) + (long)(invar1); + for (int i = 0; i < size; i+=2) { + byte v0 = m.get(ValueLayout.JAVA_BYTE, i + i1 + 0); + byte v1 = m.get(ValueLayout.JAVA_BYTE, i + i2 + 1); + m.set(ValueLayout.JAVA_BYTE, i + i1 + 0, (byte)(v0 + 1)); + m.set(ValueLayout.JAVA_BYTE, i + i2 + 1, (byte)(v1 + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentBInvarI3e(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(invar1 + invar2 - invar3); + long i2 = (long)(invar2 - 
invar3 + invar1); // equivalent + for (int i = 0; i < size; i++) { + byte v = m.get(ValueLayout.JAVA_BYTE, i + i1); + m.set(ValueLayout.JAVA_BYTE, i + i2, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentBInvarI3f(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(invar1 - (invar2 - invar3)); + long i2 = (long)(-invar2 + invar3 + invar1); // equivalent + for (int i = 0; i < size; i++) { + byte v = m.get(ValueLayout.JAVA_BYTE, i + i1); + m.set(ValueLayout.JAVA_BYTE, i + i2, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentBInvarL3g(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = invar1 - (invar2 - invar3); + long i2 = -invar2 + invar3 + invar1; // equivalent + for (int i = 0; i < size; i++) { + byte v = m.get(ValueLayout.JAVA_BYTE, i + i1); + m.set(ValueLayout.JAVA_BYTE, i + i2, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentBInvarL3h(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = -invar1 - invar2 - invar3; + long i2 = -invar2 - invar3 - invar1; // equivalent + for (int i = 0; i < size; i++) { + byte v = m.get(ValueLayout.JAVA_BYTE, i + i1); + m.set(ValueLayout.JAVA_BYTE, i + i2, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + 
@Test + @IR(counts = {IRNode.LOAD_VECTOR_B, "> 0", + IRNode.ADD_VB, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentBInvarL3k(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = -invar1 + invar2 + invar3; + long i2 = invar2 + invar3 - invar1; // equivalent + for (int i = 0; i < size; i++) { + byte v = m.get(ValueLayout.JAVA_BYTE, i + i1); + m.set(ValueLayout.JAVA_BYTE, i + i2, (byte)(v + 1)); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentIInvarL3a(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = invar1 + invar2 + invar3; + long i2 = invar2 + invar3 + invar1; // equivalent + for (int i = 0; i < size; i++) { + int v = m.getAtIndex(ValueLayout.JAVA_INT, i + i1); + m.setAtIndex(ValueLayout.JAVA_INT, i + i2, v + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentIInvarL3b(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = -invar1 - invar2 - invar3; + long i2 = -invar2 - invar3 - invar1; // equivalent + for (int i = 0; i < size; i++) { + int v = m.getAtIndex(ValueLayout.JAVA_INT, i + i1); + m.setAtIndex(ValueLayout.JAVA_INT, i + i2, v + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + 
applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentIInvarL3c(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = -invar1 + invar2 + invar3; + long i2 = invar2 + invar3 - invar1; // equivalent + for (int i = 0; i < size; i++) { + int v = m.getAtIndex(ValueLayout.JAVA_INT, i + i1); + m.setAtIndex(ValueLayout.JAVA_INT, i + i2, v + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Would be nice if it vectorized. + // Fails because of control flow. Somehow the "offsetPlain" check (checks for alignment) is not folded away. + static Object[] testMemorySegmentIInvarL3d(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(-invar1 + invar2 + invar3); + long i2 = (long)(invar2 + invar3 - invar1); // equivalent + for (int i = 0; i < size; i+=2) { + int v0 = m.getAtIndex(ValueLayout.JAVA_INT, i + i1 + 0); + int v1 = m.getAtIndex(ValueLayout.JAVA_INT, i + i2 + 1); + m.setAtIndex(ValueLayout.JAVA_INT, i + i1 + 0, v0 + 1); + m.setAtIndex(ValueLayout.JAVA_INT, i + i2 + 1, v1 + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Would be nice if it vectorized. + // Fails because of control flow. Somehow the "offsetPlain" check (checks for alignment) is not folded away. 
+ static Object[] testMemorySegmentIInvarL3d2(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(-invar1 + invar2 + invar3); + for (int i = 0; i < size; i+=2) { + int v0 = m.getAtIndex(ValueLayout.JAVA_INT, i + i1 + 0); + int v1 = m.getAtIndex(ValueLayout.JAVA_INT, i + i1 + 1); + m.setAtIndex(ValueLayout.JAVA_INT, i + i1 + 0, v0 + 1); + m.setAtIndex(ValueLayout.JAVA_INT, i + i1 + 1, v1 + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // But here the "offsetPlain" is folded away + static Object[] testMemorySegmentIInvarL3d3(MemorySegment m, int size) { + for (int i = 0; i < size; i+=2) { + int v0 = m.getAtIndex(ValueLayout.JAVA_INT, i + 0); + int v1 = m.getAtIndex(ValueLayout.JAVA_INT, i + 1); + m.setAtIndex(ValueLayout.JAVA_INT, i + 0, v0 + 1); + m.setAtIndex(ValueLayout.JAVA_INT, i + 1, v1 + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Should never vectorize, since i1 and i2 are not guaranteed to be adjacent + // invar2 + invar3 could overflow, and the address be valid with and without overflow. + // So both addresses are valid, and not adjacent. 
+ static Object[] testMemorySegmentIInvarL3e(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(-invar1 + invar2 + invar3); + long i2 = (long)(invar2 + invar3) - (long)(invar1); // not equivalent + for (int i = 0; i < size; i+=2) { + int v0 = m.getAtIndex(ValueLayout.JAVA_INT, i + i1 + 0); + int v1 = m.getAtIndex(ValueLayout.JAVA_INT, i + i2 + 1); + m.setAtIndex(ValueLayout.JAVA_INT, i + i1 + 0, v0 + 1); + m.setAtIndex(ValueLayout.JAVA_INT, i + i2 + 1, v1 + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_I, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentIInvarL3f(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = -invar1 + invar2 + invar3; + long i2 = invar2 + invar3 - invar1; // equivalent + for (int i = 0; i < size; i++) { + // Scale the index manually + int v = m.get(ValueLayout.JAVA_INT, 4 * (i + i1)); + m.set(ValueLayout.JAVA_INT, 4 * (i + i2), v + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_VL, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentLInvarL3a(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = invar1 + invar2 + invar3; + long i2 = invar2 + invar3 + invar1; // equivalent + for (int i = 0; i < size; i++) { + long v = m.getAtIndex(ValueLayout.JAVA_LONG, i + i1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + i2, v + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_VL, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = 
{"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentLInvarL3b(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = -invar1 - invar2 - invar3; + long i2 = -invar2 - invar3 - invar1; // equivalent + for (int i = 0; i < size; i++) { + long v = m.getAtIndex(ValueLayout.JAVA_LONG, i + i1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + i2, v + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_VL, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentLInvarL3c(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = -invar1 + invar2 + invar3; + long i2 = invar2 + invar3 - invar1; // equivalent + for (int i = 0; i < size; i++) { + long v = m.getAtIndex(ValueLayout.JAVA_LONG, i + i1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + i2, v + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Would be nice if it vectorized. + // Fails because of control flow. Somehow the "offsetPlain" check (checks for alignment) is not folded away. 
+ static Object[] testMemorySegmentLInvarL3d(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(-invar1 + invar2 + invar3); + long i2 = (long)(invar2 + invar3 - invar1); // equivalent + for (int i = 0; i < size; i+=2) { + long v0 = m.getAtIndex(ValueLayout.JAVA_LONG, i + i1 + 0); + long v1 = m.getAtIndex(ValueLayout.JAVA_LONG, i + i2 + 1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + i1 + 0, v0 + 1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + i2 + 1, v1 + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // Would be nice if it vectorized. + // Fails because of control flow. Somehow the "offsetPlain" check (checks for alignment) is not folded away. + static Object[] testMemorySegmentLInvarL3d2(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(-invar1 + invar2 + invar3); + for (int i = 0; i < size; i+=2) { + long v0 = m.getAtIndex(ValueLayout.JAVA_LONG, i + i1 + 0); + long v1 = m.getAtIndex(ValueLayout.JAVA_LONG, i + i1 + 1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + i1 + 0, v0 + 1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + i1 + 1, v1 + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_VL, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // But here the "offsetPlain" is folded away + static Object[] testMemorySegmentLInvarL3d3(MemorySegment m, int size) { + for (int i = 0; i < size; i+=2) { + long v0 = m.getAtIndex(ValueLayout.JAVA_LONG, i + 0); + long v1 = m.getAtIndex(ValueLayout.JAVA_LONG, i + 1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + 0, v0 + 1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + 1, v1 + 1); + } + return new Object[]{ m }; + } + 
+ @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "= 0", + IRNode.STORE_VECTOR, "= 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + // FAILS: should be ok to vectorize, but does not. Investigate in JDK-8330274. + static Object[] testMemorySegmentLInvarL3e(MemorySegment m, int invar1, int invar2, int invar3, int size) { + long i1 = (long)(-invar1 + invar2 + invar3); + long i2 = (long)(invar2 + invar3) - (long)(invar1); // not equivalent + for (int i = 0; i < size; i+=2) { + long v0 = m.getAtIndex(ValueLayout.JAVA_LONG, i + i1 + 0); + long v1 = m.getAtIndex(ValueLayout.JAVA_LONG, i + i2 + 1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + i1 + 0, v0 + 1); + m.setAtIndex(ValueLayout.JAVA_LONG, i + i2 + 1, v1 + 1); + } + return new Object[]{ m }; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR_L, "> 0", + IRNode.ADD_VL, "> 0", + IRNode.STORE_VECTOR, "> 0"}, + applyIfPlatform = {"64-bit", "true"}, + applyIf = {"AlignVector", "false"}, + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static Object[] testMemorySegmentLInvarL3f(MemorySegment m, long invar1, long invar2, long invar3, int size) { + long i1 = -invar1 + invar2 + invar3; + long i2 = invar2 + invar3 - invar1; // equivalent + for (int i = 0; i < size; i++) { + // Scale the index manually + long v = m.get(ValueLayout.JAVA_LONG, 8 * (i + i1)); + m.set(ValueLayout.JAVA_LONG, 8 * (i + i2), v + 1); + } + return new Object[]{ m }; + } + + @Test + // Traversal through AddI would explode in exponentially many paths, exhausting the node limit. + // For this, we have a traversal size limit. 
+ static Object[] testLargeInvariantSum(byte[] a, int invar1, int invar2, int invar3, int size) { + int e = invar1; + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + e = ((e + invar2) + (e + invar3)); + for (int i = 0; i < size; i++) { + a[i + e] += 1; + } + return new Object[]{ a }; + } +} From 0baa1f9f25f9c0a9d27353a247409f0bc509259a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 2 Dec 2024 08:40:44 +0100 Subject: [PATCH 108/130] fix up print --- src/hotspot/share/opto/mempointer.hpp | 6 +++--- src/hotspot/share/opto/vectorization.cpp | 25 +++++++++++++++++------- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 822c5fdf3ab0c..98443721d3582 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -599,15 +599,15 @@ class MemPointer : public StackObj { void print_on(outputStream* st) const { switch (_kind) { case Object: - tty->print("native "); + tty->print("object "); tty->print("%d %s", _base->_idx, _base->Name()); break; case Native: - tty->print("native "); + tty->print("native "); tty->print("%d %s", _base->_idx, _base->Name()); break; default: - 
tty->print("native"); + tty->print("unknown "); }; } #endif diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index a61f3a3e0b2e5..309debb547809 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -401,16 +401,27 @@ void VPointer::print_on(outputStream* st, bool end_with_cr) const { return; } - st->print("size: %2d, base: ", size()); - _mem_pointer.base().print_on(st); - st->print(", form: "); - _mem_pointer.print_form_on(st); - st->print(", invar_summands: "); + st->print("size: %2d, %s, ", size(), + _mem_pointer.base().is_object() ? "object" : "native"); + + Node* base = _mem_pointer.base().object_or_native(); + tty->print("base(%d %s) + con(%3d) + iv_scale(%3d) * iv + invar(", + base->_idx, base->Name(), + _mem_pointer.con().value(), + _iv_scale); + + int count = 0; for_each_invar_summand([&] (const MemPointerSummand& s) { + if (count > 0) { + st->print(" + "); + } s.print_on(tty); - st->print(","); + count++; }); - st->print("]"); + if (count == 0) { + st->print("0"); + } + st->print(")]"); if (end_with_cr) { st->cr(); } } #endif From 4b3c7d2970026a3a8c55a90d6c1f0c46c8998d35 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 2 Dec 2024 10:32:15 +0100 Subject: [PATCH 109/130] rename --- src/hotspot/share/opto/vectorization.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 309debb547809..9d7f6e815e23b 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -209,9 +209,9 @@ void VLoopVPointers::print() const { tty->print_cr("\nVLoopVPointers::print:"); _body.for_each_mem([&] (const MemNode* mem, int bb_idx) { - const VPointer& xp = vpointer(mem); + const VPointer& p = vpointer(mem); tty->print(" "); - xp.print_on(tty); + p.print_on(tty); }); } #endif From 4ef7cee9a4b1e9c3765a2b3942ab82499375e3f8 Mon Sep 
17 00:00:00 2001 From: Emanuel Peter Date: Fri, 13 Dec 2024 07:21:34 +0100 Subject: [PATCH 110/130] fix printing --- src/hotspot/share/opto/mempointer.hpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 98443721d3582..f9801a977cc70 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -599,15 +599,15 @@ class MemPointer : public StackObj { void print_on(outputStream* st) const { switch (_kind) { case Object: - tty->print("object "); - tty->print("%d %s", _base->_idx, _base->Name()); + st->print("object "); + st->print("%d %s", _base->_idx, _base->Name()); break; case Native: - tty->print("native "); - tty->print("%d %s", _base->_idx, _base->Name()); + st->print("native "); + st->print("%d %s", _base->_idx, _base->Name()); break; default: - tty->print("unknown "); + st->print("unknown "); }; } #endif From b64f929591d93a417023c8d1b1c96c066e06e1b5 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Fri, 3 Jan 2025 09:24:25 +0100 Subject: [PATCH 111/130] copyright 2025 --- src/hotspot/share/classfile/vmSymbols.hpp | 2 +- src/hotspot/share/opto/memnode.cpp | 2 +- src/hotspot/share/opto/mempointer.cpp | 2 +- src/hotspot/share/opto/mempointer.hpp | 2 +- src/hotspot/share/opto/noOverflowInt.hpp | 2 +- src/hotspot/share/opto/superword.cpp | 2 +- src/hotspot/share/opto/superword.hpp | 2 +- src/hotspot/share/opto/superwordVTransformBuilder.cpp | 2 +- src/hotspot/share/opto/traceAutoVectorizationTag.hpp | 2 +- src/hotspot/share/opto/traceMergeStoresTag.hpp | 2 +- src/hotspot/share/opto/vectorization.cpp | 2 +- src/hotspot/share/opto/vectorization.hpp | 2 +- src/hotspot/share/opto/vtransform.cpp | 2 +- src/hotspot/share/opto/vtransform.hpp | 2 +- .../jtreg/compiler/loopopts/superword/TestAlignVector.java | 2 +- .../compiler/loopopts/superword/TestEquivalentInvariants.java | 2 +- 
.../compiler/loopopts/superword/TestLargeScaleAndStride.java | 2 +- .../jtreg/compiler/loopopts/superword/TestMemorySegment.java | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/src/hotspot/share/classfile/vmSymbols.hpp b/src/hotspot/share/classfile/vmSymbols.hpp index 16321aaae1d16..6859ce4249121 100644 --- a/src/hotspot/share/classfile/vmSymbols.hpp +++ b/src/hotspot/share/classfile/vmSymbols.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index f9f1135b33ad3..99a68aa690c33 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 1997, 2025, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2024, Alibaba Group Holding Limited. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index b1aa584dfe618..9c69bd5781b30 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index f9801a977cc70..ad571707b41d8 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/share/opto/noOverflowInt.hpp b/src/hotspot/share/opto/noOverflowInt.hpp index 6e14e446e7bec..2cc51ff678fb8 100644 --- a/src/hotspot/share/opto/noOverflowInt.hpp +++ b/src/hotspot/share/opto/noOverflowInt.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index c6b50d690616f..aea2e0bfdd90a 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2007, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 3d10a01ecc623..2b971cd9a03f8 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007, 2024, Oracle and/or its affiliates. 
All rights reserved. + * Copyright (c) 2007, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/share/opto/superwordVTransformBuilder.cpp b/src/hotspot/share/opto/superwordVTransformBuilder.cpp index dd1bd2b851f80..2aca5398d6010 100644 --- a/src/hotspot/share/opto/superwordVTransformBuilder.cpp +++ b/src/hotspot/share/opto/superwordVTransformBuilder.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp index 5d7f0875ef9dd..0c08777c90c4b 100644 --- a/src/hotspot/share/opto/traceAutoVectorizationTag.hpp +++ b/src/hotspot/share/opto/traceAutoVectorizationTag.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/share/opto/traceMergeStoresTag.hpp b/src/hotspot/share/opto/traceMergeStoresTag.hpp index 68969cd5dc0d0..214173c02f7dd 100644 --- a/src/hotspot/share/opto/traceMergeStoresTag.hpp +++ b/src/hotspot/share/opto/traceMergeStoresTag.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 9d7f6e815e23b..4e00a5f6ba78d 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2023, Arm Limited. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 8010b12e2ddb3..22d7d6e00b9bc 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2023, Arm Limited. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 725b3ad153595..3377029b3476b 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/src/hotspot/share/opto/vtransform.hpp b/src/hotspot/share/opto/vtransform.hpp index 4b5346d5d9deb..4fc68c7b4dfc2 100644 --- a/src/hotspot/share/opto/vtransform.hpp +++ b/src/hotspot/share/opto/vtransform.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java index 4e006c575e2f4..b3a668da27ca2 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestAlignVector.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java index 074071e96d218..9abe3710aa83f 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java index 850285fa5046f..0b3963867bf62 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestLargeScaleAndStride.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java index 4040527a58ecb..195fc0034d9a1 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it From 2cf3e8753c569593901a9a552caaea47a38ca6a9 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 14 Jan 2025 07:45:49 +0100 Subject: [PATCH 112/130] for vnkozlov part 1 --- src/hotspot/share/opto/mempointer.hpp | 12 ++++++------ src/hotspot/share/opto/noOverflowInt.hpp | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index ad571707b41d8..b680acd001c74 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -617,24 +617,24 @@ class MemPointer : public StackObj { }; private: - NOT_PRODUCT( const TraceMemPointer& _trace; ) MemPointerSummand _summands[SUMMANDS_SIZE]; const NoOverflowInt _con; const Base _base; const jint _size; + NOT_PRODUCT( const TraceMemPointer& _trace; ) // Default / trivial: pointer = 0 + 1 * pointer MemPointer(Node* pointer, const jint size NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : - NOT_PRODUCT(_trace(trace) COMMA) _con(NoOverflowInt(0)), _base(Base()), _size(size) + NOT_PRODUCT(COMMA _trace(trace)) { assert(pointer != nullptr, "pointer must be non-null"); _summands[0] = MemPointerSummand(pointer, NoOverflowInt(1)); - assert(1 <= _size && _size <= 2048 && is_power_of_2(_size), "valid size"); + assert(1 <= _size && _size <= 2048 && is_power_of_2(_size), "sanity: no vector is expected to be larger"); } // pointer = SUM(SUMMANDS) + con @@ -643,10 +643,10 @@ class MemPointer : public StackObj { const NoOverflowInt& con, const jint size NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : - NOT_PRODUCT(_trace(trace) COMMA) _con(con), _base(Base::make(pointer, summands)), _size(size) + NOT_PRODUCT(COMMA _trace(trace)) { assert(!_con.is_NaN(), "non-NaN constant"); assert(summands.length() <= SUMMANDS_SIZE, "summands must fit"); @@ -674,7 +674,7 @@ class MemPointer : public StackObj { } assert(pos == summands.length(), "copied all 
summands"); - assert(1 <= _size && _size <= 2048 && is_power_of_2(_size), "valid size"); + assert(1 <= _size && _size <= 2048 && is_power_of_2(_size), "sanity: no vector is expected to be larger"); } // Mutated copy. @@ -682,10 +682,10 @@ class MemPointer : public StackObj { MemPointer(const MemPointer& old, const NoOverflowInt new_con, const jint new_size) : - NOT_PRODUCT(_trace(old._trace) COMMA) _con(new_con), _base(old.base()), _size(new_size) + NOT_PRODUCT(COMMA _trace(old._trace)) { assert(!_con.is_NaN(), "non-NaN constant"); for (int i = 0; i < SUMMANDS_SIZE; i++) { diff --git a/src/hotspot/share/opto/noOverflowInt.hpp b/src/hotspot/share/opto/noOverflowInt.hpp index 2cc51ff678fb8..8eca8e6bece0b 100644 --- a/src/hotspot/share/opto/noOverflowInt.hpp +++ b/src/hotspot/share/opto/noOverflowInt.hpp @@ -102,6 +102,7 @@ class NoOverflowInt { } static int cmp(const NoOverflowInt& a, const NoOverflowInt& b) { + // Order NaN (overflow, uninitialized, etc) after non-NaN. if (a.is_NaN()) { return b.is_NaN() ? 
0 : 1; } else if (b.is_NaN()) { From 40090281ae367a73c653bb53f58e8ce35394e230 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 14 Jan 2025 08:01:41 +0100 Subject: [PATCH 113/130] for vnkozlov part 2 --- src/hotspot/share/opto/memnode.cpp | 4 ++-- src/hotspot/share/opto/mempointer.cpp | 6 +++--- src/hotspot/share/opto/mempointer.hpp | 13 ++++++++++--- src/hotspot/share/opto/vectorization.hpp | 6 +++--- .../superword/TestEquivalentInvariants.java | 2 +- 5 files changed, 19 insertions(+), 12 deletions(-) diff --git a/src/hotspot/share/opto/memnode.cpp b/src/hotspot/share/opto/memnode.cpp index 99a68aa690c33..fe504e0d1f39f 100644 --- a/src/hotspot/share/opto/memnode.cpp +++ b/src/hotspot/share/opto/memnode.cpp @@ -2947,8 +2947,8 @@ bool MergePrimitiveStores::is_adjacent_pair(const StoreNode* use_store, const St is_trace_pointer_adjacency(), true); #endif - const MemPointer pointer_use(NOT_PRODUCT(trace COMMA) use_store); - const MemPointer pointer_def(NOT_PRODUCT(trace COMMA) def_store); + const MemPointer pointer_use(use_store NOT_PRODUCT(COMMA trace)); + const MemPointer pointer_def(def_store NOT_PRODUCT(COMMA trace)); return pointer_def.is_adjacent_to_and_before(pointer_use); } diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 9c69bd5781b30..7084dd5257fa3 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -30,9 +30,9 @@ MemPointerParserCallback MemPointerParserCallback::_empty; -MemPointer::MemPointer(NOT_PRODUCT(const TraceMemPointer& trace COMMA) - const MemNode* mem, - MemPointerParserCallback& callback) : +MemPointer::MemPointer(const MemNode* mem, + MemPointerParserCallback& callback + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : MemPointer(MemPointerParser::parse(NOT_PRODUCT(trace COMMA) mem, callback)) {} diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index b680acd001c74..ed39111d0ba2f 100644 --- 
a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -695,9 +695,16 @@ class MemPointer : public StackObj { public: // Parse pointer of MemNode. Delegates to MemPointerParser::parse. - MemPointer(NOT_PRODUCT(const TraceMemPointer& trace COMMA) - const MemNode* mem, - MemPointerParserCallback& callback = MemPointerParserCallback::empty()); + // callback: receives a callback for every decomposed (inner) node + // of the pointer expression. + MemPointer(const MemNode* mem, + MemPointerParserCallback& callback + NOT_PRODUCT(COMMA const TraceMemPointer& trace)); + + // Parse pointer of MemNode. Delegates to MemPointerParser::parse. + MemPointer(const MemNode* mem + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : + MemPointer(mem, MemPointerParserCallback::empty() NOT_PRODUCT(COMMA trace)) {} static MemPointer make_trivial(Node* pointer, const jint size diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 22d7d6e00b9bc..f6b8e1c9785ab 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -744,9 +744,9 @@ class VPointer : public ArenaObj { const VLoop& vloop, MemPointerParserCallback& callback = MemPointerParserCallback::empty()) : VPointer(vloop, - MemPointer(NOT_PRODUCT(vloop.mptrace() COMMA) - mem, - callback)) + MemPointer(mem, + callback + NOT_PRODUCT(COMMA vloop.mptrace()))) { #ifndef PRODUCT if (vloop.mptrace().is_trace_parsing()) { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java index 9abe3710aa83f..0d3dbd237078f 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, 2025, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it From 230d7f14e7a62ee113964c3db763b4d9d9270b60 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 14 Jan 2025 08:13:42 +0100 Subject: [PATCH 114/130] for vnkozlov part 3 --- src/hotspot/share/opto/mempointer.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index ed39111d0ba2f..16ad596473670 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -823,7 +823,6 @@ class MemPointerParser : public StackObj { // Resulting decomposed-form. MemPointer _mem_pointer; -public: MemPointerParser(const MemNode* mem, MemPointerParserCallback& callback NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : @@ -832,6 +831,7 @@ class MemPointerParser : public StackObj { _con(NoOverflowInt(0)), _mem_pointer(parse(callback)) {} +public: static MemPointer parse(NOT_PRODUCT(const TraceMemPointer& trace COMMA) const MemNode* mem, MemPointerParserCallback& callback = MemPointerParserCallback::empty()) { From 84e8ce0a3df34e919e5ee7262bd21ef05fbc006a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 14 Jan 2025 08:21:49 +0100 Subject: [PATCH 115/130] for vnkozlov part 4 --- src/hotspot/share/opto/mempointer.cpp | 6 +++--- src/hotspot/share/opto/mempointer.hpp | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 7084dd5257fa3..550c69dde5043 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -33,9 +33,9 @@ MemPointerParserCallback MemPointerParserCallback::_empty; MemPointer::MemPointer(const MemNode* mem, MemPointerParserCallback& callback NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : 
- MemPointer(MemPointerParser::parse(NOT_PRODUCT(trace COMMA) - mem, - callback)) {} + MemPointer(MemPointerParser::parse(mem, + callback + NOT_PRODUCT(COMMA trace))) {} // Recursively parse the pointer expression with a DFS all-path traversal // (i.e. with node repetitions), starting at the pointer. diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 16ad596473670..db499ec02990d 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -832,9 +832,9 @@ class MemPointerParser : public StackObj { _mem_pointer(parse(callback)) {} public: - static MemPointer parse(NOT_PRODUCT(const TraceMemPointer& trace COMMA) - const MemNode* mem, - MemPointerParserCallback& callback = MemPointerParserCallback::empty()) { + static MemPointer parse(const MemNode* mem, + MemPointerParserCallback& callback + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { assert(mem->is_Store() || mem->is_Load(), "only stores and loads are allowed"); ResourceMark rm; MemPointerParser parser(mem, callback NOT_PRODUCT(COMMA trace)); From 4d857687f108291c8727a05091a04aeaa33dfe1a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 14 Jan 2025 08:26:56 +0100 Subject: [PATCH 116/130] for vnkozlov part 5 --- src/hotspot/share/opto/mempointer.cpp | 11 ++++++----- src/hotspot/share/opto/mempointer.hpp | 8 +++----- 2 files changed, 9 insertions(+), 10 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 550c69dde5043..9240864024f70 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -39,7 +39,8 @@ MemPointer::MemPointer(const MemNode* mem, // Recursively parse the pointer expression with a DFS all-path traversal // (i.e. with node repetitions), starting at the pointer. 
-MemPointer MemPointerParser::parse(MemPointerParserCallback& callback) { +MemPointer MemPointerParser::parse(MemPointerParserCallback& callback + NOT_PRODUCT(COMMA const TraceMemPointer& trace)) { assert(_worklist.is_empty(), "no prior parsing"); assert(_summands.is_empty(), "no prior parsing"); @@ -55,14 +56,14 @@ MemPointer MemPointerParser::parse(MemPointerParserCallback& callback) { while (_worklist.is_nonempty()) { // Bail out if the graph is too complex. if (traversal_count++ > 1000) { - return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); + return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA trace)); } parse_sub_expression(_worklist.pop(), callback); } // Bail out if there is a constant overflow. if (_con.is_NaN()) { - return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); + return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA trace)); } // Sorting by variable idx means that all summands with the same variable are consecutive. @@ -83,7 +84,7 @@ MemPointer MemPointerParser::parse(MemPointerParserCallback& callback) { } // Bail out if scale is NaN. if (scale.is_NaN()) { - return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA _trace)); + return MemPointer::make_trivial(pointer, size NOT_PRODUCT(COMMA trace)); } // Keep summands with non-zero scale. if (!scale.is_zero()) { @@ -92,7 +93,7 @@ MemPointer MemPointerParser::parse(MemPointerParserCallback& callback) { } _summands.trunc_to(pos_put); - return MemPointer::make(pointer, _summands, _con, size NOT_PRODUCT(COMMA _trace)); + return MemPointer::make(pointer, _summands, _con, size NOT_PRODUCT(COMMA trace)); } // Parse a sub-expression of the pointer, starting at the current summand. 
We parse the diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index db499ec02990d..798afe695e610 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -811,8 +811,6 @@ class MemPointer : public StackObj { class MemPointerParser : public StackObj { private: - NOT_PRODUCT( const TraceMemPointer& _trace; ) - const MemNode* _mem; // Internal data-structures for parsing. @@ -826,10 +824,9 @@ class MemPointerParser : public StackObj { MemPointerParser(const MemNode* mem, MemPointerParserCallback& callback NOT_PRODUCT(COMMA const TraceMemPointer& trace)) : - NOT_PRODUCT(_trace(trace) COMMA) _mem(mem), _con(NoOverflowInt(0)), - _mem_pointer(parse(callback)) {} + _mem_pointer(parse(callback NOT_PRODUCT(COMMA trace))) {} public: static MemPointer parse(const MemNode* mem, @@ -856,7 +853,8 @@ class MemPointerParser : public StackObj { private: const MemPointer& mem_pointer() const { return _mem_pointer; } - MemPointer parse(MemPointerParserCallback& callback); + MemPointer parse(MemPointerParserCallback& callback + NOT_PRODUCT(COMMA const TraceMemPointer& trace)); void parse_sub_expression(const MemPointerSummand& summand, MemPointerParserCallback& callback); static bool sub_expression_has_native_base_candidate(Node* n); From ec938c31b8e5d112edec9948021577a023b4ad8c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 14 Jan 2025 09:10:04 +0100 Subject: [PATCH 117/130] for vnkozlov part 6 --- src/hotspot/share/opto/mempointer.cpp | 3 ++- src/hotspot/share/opto/superword.cpp | 15 +++++---------- src/hotspot/share/opto/superword.hpp | 8 ++++++++ src/hotspot/share/opto/vtransform.cpp | 14 ++++++-------- 4 files changed, 21 insertions(+), 19 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 9240864024f70..7806d47f62105 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -218,8 +218,9 @@ 
void MemPointerParser::parse_sub_expression(const MemPointerSummand& summand, Me bool MemPointerParser::sub_expression_has_native_base_candidate(Node* start) { // BFS over the expression. + // Allocate sufficient space in worklist for 100 limit below. ResourceMark rm; - GrowableArray worklist; + GrowableArray worklist(102); worklist.append(start); for (int i = 0; i < worklist.length(); i++) { Node* n = worklist.at(i); diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index aea2e0bfdd90a..56dbecd9e3c30 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -496,14 +496,10 @@ bool SuperWord::SLP_extract() { return schedule_and_apply(); } -// We use two comparisons, because a subtraction could underflow. -#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \ - if (a < b) { return -1; } \ - if (a > b) { return 1; } - int SuperWord::MemOp::cmp_by_group(MemOp* a, MemOp* b) { // Opcode - RETURN_CMP_VALUE_IF_NOT_EQUAL(a->mem()->Opcode(), b->mem()->Opcode()); + int c_Opcode = cmp_code(a->mem()->Opcode(), b->mem()->Opcode()); + if (c_Opcode != 0) { return c_Opcode; } // VPointer summands return MemPointer::cmp_summands(a->vpointer().mem_pointer(), @@ -518,11 +514,10 @@ int SuperWord::MemOp::cmp_by_group_and_con_and_original_index(MemOp* a, MemOp* b // VPointer con jint a_con = a->vpointer().mem_pointer().con().value(); jint b_con = b->vpointer().mem_pointer().con().value(); - RETURN_CMP_VALUE_IF_NOT_EQUAL(a_con, b_con); - - RETURN_CMP_VALUE_IF_NOT_EQUAL(a->original_index(), b->original_index()); + int c_con = cmp_code(a_con, b_con); + if (c_con != 0) { return c_con; } - return 0; + return cmp_code(a->original_index(), b->original_index()); } // Find the "seed" memops pairs. These are pairs that we strongly suspect would lead to vectorization. 
diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 2b971cd9a03f8..e66b57358955b 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -584,6 +584,14 @@ class SuperWord : public ResourceObj { static int cmp_by_group(MemOp* a, MemOp* b); static int cmp_by_group_and_con_and_original_index(MemOp* a, MemOp* b); + + // We use two comparisons, because a subtraction could underflow. + template + static int cmp_code(T a, T b) { + if (a < b) { return -1; } + if (a > b) { return 1; } + return 0; + } }; void create_adjacent_memop_pairs(); void collect_valid_memops(GrowableArray& memops); diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 3377029b3476b..8ca5839775bae 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -144,11 +144,6 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const { } #endif -// We use two comparisons, because a subtraction could underflow. -#define RETURN_CMP_VALUE_IF_NOT_EQUAL(a, b) \ - if (a < b) { return -1; } \ - if (a > b) { return 1; } - // Helper-class for VTransformGraph::has_store_to_load_forwarding_failure. // It wraps a VPointer. The VPointer have an iv_offset applied, which // simulates a virtual unrolling. They represent the memory region: @@ -184,9 +179,12 @@ class VMemoryRegion : public ResourceObj { int cmp_group = cmp_for_sort_by_group(*r1, *r2); if (cmp_group != 0) { return cmp_group; } - RETURN_CMP_VALUE_IF_NOT_EQUAL((*r1)->vpointer().con(), - (*r2)->vpointer().con()); - return 0; // equal + // We use two comparisons, because a subtraction could underflow. 
+ jint con1 = (*r1)->vpointer().con(); + jint con2 = (*r2)->vpointer().con(); + if (con1 < con2) { return -1; } + if (con1 > con2) { return 1; } + return 0; } enum Aliasing { DIFFERENT_GROUP, BEFORE, EXACT_OVERLAP, PARTIAL_OVERLAP, AFTER }; From 27eba3d88c6a5ca4f743c80b9cc590f372aa1bd9 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 15 Jan 2025 07:36:40 +0100 Subject: [PATCH 118/130] More fixes for vnkozlov --- src/hotspot/share/opto/noOverflowInt.hpp | 7 ++++++- .../loopopts/superword/TestMemorySegment.java | 14 ++------------ 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/hotspot/share/opto/noOverflowInt.hpp b/src/hotspot/share/opto/noOverflowInt.hpp index 8eca8e6bece0b..96473407cca11 100644 --- a/src/hotspot/share/opto/noOverflowInt.hpp +++ b/src/hotspot/share/opto/noOverflowInt.hpp @@ -101,8 +101,13 @@ class NoOverflowInt { return a.value() % b.value() == 0; } + // This "cmp" is used for sort only. + // Note: the NaN semantics are different from floating arithmetic NaNs! + // - Smaller non-NaN are before larger non-NaN. + // - Any non-NaN are before NaN. + // - NaN is equal to NaN. + // Note: NaN indicate overflow, uninitialized, etc. static int cmp(const NoOverflowInt& a, const NoOverflowInt& b) { - // Order NaN (overflow, uninitialized, etc) after non-NaN. if (a.is_NaN()) { return b.is_NaN() ? 
0 : 1; } else if (b.is_NaN()) { diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java index 195fc0034d9a1..38533f9d07217 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java @@ -645,14 +645,9 @@ static Object[] testLongLoop_longIndex_longInvar_sameAdr_byte(MemorySegment a, l } @Test - // @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", - // IRNode.ADD_VB, "= 0", - // IRNode.STORE_VECTOR, "= 0"}, - // applyIfPlatform = {"64-bit", "true"}, - // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // FAILS: invariants are sorted differently, because of differently inserted Cast. // See: JDK-8330274 - // Interestingly, it now passes for native, but not for objects. + // Interestingly, it now vectorizes for native, but not for arrays. static Object[] testLongLoop_longIndex_intInvar_byte(MemorySegment a, int invar) { for (long i = 0; i < a.byteSize(); i++) { long adr1 = (long)(i) + (long)(invar); @@ -664,14 +659,9 @@ static Object[] testLongLoop_longIndex_intInvar_byte(MemorySegment a, int invar) } @Test - // @IR(counts = {IRNode.LOAD_VECTOR_B, "= 0", - // IRNode.ADD_VB, "= 0", - // IRNode.STORE_VECTOR, "= 0"}, - // applyIfPlatform = {"64-bit", "true"}, - // applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // FAILS: invariants are sorted differently, because of differently inserted Cast. // See: JDK-8330274 - // Interestingly, it now passes for native, but not for objects. + // Interestingly, it now vectorizes for native, but not for arrays. 
static Object[] testLongLoop_longIndex_longInvar_byte(MemorySegment a, long invar) { for (long i = 0; i < a.byteSize(); i++) { long adr1 = (long)(i) + (long)(invar); From 5bca8e20bda73741d15048a70662e1581765d243 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 15 Jan 2025 15:06:05 +0100 Subject: [PATCH 119/130] Apply suggestions from code review by Christian Co-authored-by: Christian Hagedorn --- src/hotspot/share/opto/vectorization.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index f6b8e1c9785ab..3190d11ad28a3 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -86,7 +86,7 @@ class VLoop : public StackObj { CountedLoopEndNode* _pre_loop_end; // cache access to pre-loop for main loops only NOT_PRODUCT(VTrace _vtrace;) - NOT_PRODUCT(TraceMemPointer _mptrace; ) + NOT_PRODUCT(TraceMemPointer _mptrace;) static constexpr char const* FAILURE_ALREADY_VECTORIZED = "loop already vectorized"; static constexpr char const* FAILURE_UNROLL_ONLY = "loop only wants to be unrolled"; @@ -187,7 +187,7 @@ class VLoop : public StackObj { // Some nodes must be pre-loop invariant, so that they can be used for conditions // before or inside the pre-loop. For example, alignment of main-loop vector - // memops must be acheived in the pre-loop, via the exit check in the pre-loop. + // memops must be achieved in the pre-loop, via the exit check in the pre-loop. bool is_pre_loop_invariant(Node* n) const { // Must be in the main-loop, otherwise we can't access the pre-loop. // This fails during SuperWord::unrolling_analysis, but that is ok. 
@@ -698,7 +698,7 @@ class VLoopAnalyzer : StackObj { VStatus setup_submodules_helper(); }; -// VPointer wraps the MemPointer to the use in a loop: +// VPointer wraps the MemPointer for the use in a loop: // // pointer = SUM(summands) + con // @@ -719,7 +719,7 @@ class VLoopAnalyzer : StackObj { // If we find a summand where the variable is the iv, we set iv_scale to the // corresponding scale. If there is no such summand, then we know that the // pointer does not depend on the iv, since otherwise there would have to be -// a summand where its variable it main-loop variant. +// a summand where its variable is main-loop variant. // class VPointer : public ArenaObj { private: @@ -801,9 +801,9 @@ class VPointer : public ArenaObj { // Accessors bool is_valid() const { return _is_valid; } - const MemPointer& mem_pointer() const { assert(_is_valid, ""); return _mem_pointer; } - jint size() const { assert(_is_valid, ""); return mem_pointer().size(); } - jint iv_scale() const { assert(_is_valid, ""); return _iv_scale; } + const MemPointer& mem_pointer() const { assert(_is_valid, "must be valid"); return _mem_pointer; } + jint size() const { assert(_is_valid, "must be valid"); return mem_pointer().size(); } + jint iv_scale() const { assert(_is_valid, "must be valid"); return _iv_scale; } jint con() const { return mem_pointer().con().value(); } template @@ -918,7 +918,7 @@ class VPointer : public ArenaObj { } } - // In the pointer analysis, and especially the AlignVector, analysis we assume that + // In the pointer analysis, and especially the AlignVector analysis, we assume that // stride and scale are not too large. For example, we multiply "iv_scale * iv_stride", // and assume that this does not overflow the int range. We also take "abs(iv_scale)" // and "abs(iv_stride)", which would overflow for min_int = -(2^31). 
Still, we want From 18c7933e7d3e1f55fea30354bb3d14ab8905c20c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 15 Jan 2025 15:57:18 +0100 Subject: [PATCH 120/130] better comments for Christian --- src/hotspot/share/opto/mempointer.hpp | 5 +++ src/hotspot/share/opto/vectorization.hpp | 50 ++++++++++++++++++------ 2 files changed, 42 insertions(+), 13 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 798afe695e610..9f3ae90f1682b 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -573,6 +573,11 @@ class MemPointer : public StackObj { // variables. It should be rare that we have more than 9 variables. static const int SUMMANDS_SIZE = 10; + // A base can be: + // - Known: + // - On-heap: Object + // - Off-heap: Native + // - Unknown class Base : public StackObj { private: enum Kind { Unknown, Object, Native }; diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 3190d11ad28a3..87b344c73f79d 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -698,28 +698,52 @@ class VLoopAnalyzer : StackObj { VStatus setup_submodules_helper(); }; -// VPointer wraps the MemPointer for the use in a loop: +// Reminder: MemPointer have the form: // // pointer = SUM(summands) + con // -// We define invar_summands as all summands, except those where the variable is -// the base or the loop iv. We can thus write: +// Where every summand in summands has the form: +// +// summand = scale * variable +// +// The VPointer wraps a MemPointer for the use in loops. A "valid" VPointer has +// the form: // // pointer = base + invar + iv_scale * iv + con // // invar = SUM(invar_summands) // -// We have the following components: -// - base: +// Where: +// - base: is the known base of the MemPointer. 
// on-heap (object base) or off-heap (native base address) -// - invar_summands: -// pre-loop invariant. This is important when we need to memory align a -// pointer using the pre-loop limit. -// - iv and iv_scale: +// - iv and iv_scale: i.e. the iv_summand = iv * iv_scale. // If we find a summand where the variable is the iv, we set iv_scale to the // corresponding scale. If there is no such summand, then we know that the // pointer does not depend on the iv, since otherwise there would have to be // a summand where its variable is main-loop variant. +// - invar_summands: all other summands except base and iv_summand. +// All variables must be pre-loop invariant. This is important when we need +// to memory align a pointer using the pre-loop limit. +// +// A VPointer can be marked "invalid", if some of these conditions are not met, or +// it is unknown if they are met. If a VPointer is marked "invalid", it always +// returns conservative answers to aliasing queries, which means that we do not +// optimize in these cases. For example: +// - is_adjacent_to_and_before: returning true would allow optimizations such as +// packing into vectors. So for "invalid" VPointers +// we always return false (i.e. unknown). +// - never_overlaps_with: returning true would allow optimizations such as +// swapping the order of memops. So for "invalid" VPointers +// we always return false (i.e. unknown). +// +// These are examples where a VPointer becomes "invalid": +// - If the MemPointer does not have the required form for VPointer, +// i.e. if one of these conditions is not met (see init_is_valid): +// - Base must be known. +// - All summands except the iv-summand must be pre-loop invariant. +// - Some restrictions on iv_scale and iv_stride, to avoid overflow in +// alignment computations. +// - If the new con computed in make_with_iv_offset overflows. 
// class VPointer : public ArenaObj { private: @@ -739,6 +763,10 @@ class VPointer : public ArenaObj { _iv_scale(init_iv_scale()), _is_valid(!must_be_invalid && init_is_valid()) {} + VPointer make_invalid() const { + return VPointer(_vloop, mem_pointer(), true /* must be invalid*/); + } + public: VPointer(const MemNode* mem, const VLoop& vloop, @@ -795,10 +823,6 @@ class VPointer : public ArenaObj { return p; } - VPointer make_invalid() const { - return VPointer(_vloop, mem_pointer(), true /* must be invalid*/); - } - // Accessors bool is_valid() const { return _is_valid; } const MemPointer& mem_pointer() const { assert(_is_valid, "must be valid"); return _mem_pointer; } From 9a388133c80983223c55597140dfb55491bd2972 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 16 Jan 2025 07:18:36 +0100 Subject: [PATCH 121/130] More comments for Christian --- src/hotspot/share/opto/mempointer.cpp | 40 ++++++++++++++++++++++++ src/hotspot/share/opto/mempointer.hpp | 14 ++++++++- src/hotspot/share/opto/vectorization.hpp | 13 +++++--- 3 files changed, 62 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 7806d47f62105..65c1a318025bc 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -538,6 +538,21 @@ bool MemPointer::has_different_object_base_but_otherwise_same_summands_as(const return has_same_summands_as(other, 1); } +// Examples: +// p1 = MemPointer[size=1, base + i + 16] +// p2 = MemPointer[size=1, base + i + 17] +// -> Always at distance 1 +// -> p1 always adjacent and before p2 -> return true +// +// p1 = MemPointer[size=4, x + y + z + 4L * i + 16] +// p2 = MemPointer[size=4, x + y + z + 4L * i + 20] +// -> Always at distance 4 +// -> p1 always adjacent and before p2 -> return true +// +// p1 = MemPointer[size=4, base1 + 4L * i1 + 16] +// p2 = MemPointer[size=4, base2 + 4L * i2 + 20] +// -> Have differing summands, distance is unknown +// -> 
Unknown if adjacent at runtime -> return false bool MemPointer::is_adjacent_to_and_before(const MemPointer& other) const { const MemPointerAliasing aliasing = get_aliasing_with(other NOT_PRODUCT( COMMA _trace )); const bool is_adjacent = aliasing.is_always_at_distance(_size); @@ -554,6 +569,31 @@ bool MemPointer::is_adjacent_to_and_before(const MemPointer& other) const { return is_adjacent; } +// Examples: +// p1 = MemPointer[size=1, base + i + 16] +// p2 = MemPointer[size=1, base + i + 17] +// -> Always at distance 1 +// -> Can never overlap -> return true +// +// p1 = MemPointer[size=1, base + i + 16] +// p2 = MemPointer[size=1, base + i + 16] +// -> Always at distance 0 +// -> Always have exact overlap -> return false +// +// p1 = MemPointer[size=4, x + y + z + 4L * i + 16] +// p2 = MemPointer[size=4, x + y + z + 4L * i + 56] +// -> Always at distance 40 +// -> Can never overlap -> return true +// +// p1 = MemPointer[size=8, x + y + z + 4L * i + 16] +// p2 = MemPointer[size=8, x + y + z + 4L * i + 20] +// -> Always at distance 4 +// -> Always have partial overlap -> return false +// +// p1 = MemPointer[size=4, base1 + 4L * i1 + 16] +// p2 = MemPointer[size=4, base2 + 4L * i2 + 20] +// -> Have differing summands, distance is unknown +// -> Unknown if overlap at runtime -> return false bool MemPointer::never_overlaps_with(const MemPointer& other) const { const MemPointerAliasing aliasing = get_aliasing_with(other NOT_PRODUCT( COMMA _trace )); diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 9f3ae90f1682b..7eb3d17ad7e63 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -558,13 +558,25 @@ class MemPointerParserCallback : public StackObj { static MemPointerParserCallback& empty() { return _empty; } }; -// Decomposed form of the pointer sub-expression of "pointer". 
+// A MemPointer points to a region in memory, starting at a "pointer", and extending +// for "size" bytes: +// +// [pointer, pointer + size) +// +// Where the "pointer" is decomposed into the following form: // // pointer = SUM(summands) + con +// pointer = SUM(scale_i * variable_i) + con +// +// Where SUM() adds all "scale_i * variable_i" for each i together. // // Node: if the base is known, then it is in the 0th summand. A base can be: // - on-heap / object: base().object() // - off-heap / native: base().native() +// +// pointer = scale_0 * variable_0 + scale_1 * scale_1 + ... + con +// pointer = 1 * base + scale_1 * scale_1 + ... + con +// class MemPointer : public StackObj { public: // We limit the number of summands to 10. This is just a best guess, and not at this diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 87b344c73f79d..3b9fad705008e 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -843,8 +843,8 @@ class VPointer : public ArenaObj { // Greatest common factor among the scales of the invar_summands. // Out of simplicity, we only factor out positive powers-of-2, - // between 1 and ObjectAlignmentInBytes. If the invar is empty, - // i.e. there is no summand in invar_summands, we return 0. + // between (inclusive) 1 and ObjectAlignmentInBytes. If the invar + // is empty, i.e. there is no summand in invar_summands, we return 0. jint compute_invar_factor() const { jint factor = ObjectAlignmentInBytes; int invar_count = 0; @@ -865,12 +865,15 @@ class VPointer : public ArenaObj { return invar_count; } + // If we have the same invar_summands, and the same iv summand with the same iv_scale, + // then all summands except the base must be the same. 
bool has_same_invar_and_iv_scale_as(const VPointer& other) const { - // If we have the same invar_summands, and the same iv summand with the same iv_scale, - // then all summands except the base must be the same. return mem_pointer().has_same_non_base_summands_as(other.mem_pointer()); } + + // Delegate to MemPointer::is_adjacent_to_and_before, but guard for invalid cases + // where we must return a conservative answer: unknown adjacency, return false. bool is_adjacent_to_and_before(const VPointer& other) const { if (!is_valid() || !other.is_valid()) { #ifndef PRODUCT @@ -883,6 +886,8 @@ class VPointer : public ArenaObj { return mem_pointer().is_adjacent_to_and_before(other.mem_pointer()); } + // Delegate to MemPointer::never_overlaps_with, but guard for invalid cases + // where we must return a conservative answer: unknown overlap, return false. bool never_overlaps_with(const VPointer& other) const { if (!is_valid() || !other.is_valid()) { #ifndef PRODUCT From decf4c8c91851c093ea8c6897217c536c087e3b6 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 16 Jan 2025 07:28:02 +0100 Subject: [PATCH 122/130] remove dead code left behind after JDK-8311691 --- src/hotspot/share/opto/vectorization.hpp | 45 ------------------------ 1 file changed, 45 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 3b9fad705008e..30dc64a3735f9 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -974,51 +974,6 @@ class VPointer : public ArenaObj { } }; -// Vector element size statistics for loop vectorization with vector masks -class VectorElementSizeStats { - private: - static const int NO_SIZE = -1; - static const int MIXED_SIZE = -2; - int* _stats; - - public: - VectorElementSizeStats(Arena* a) : _stats(NEW_ARENA_ARRAY(a, int, 4)) { - clear(); - } - - void clear() { memset(_stats, 0, sizeof(int) * 4); } - - void record_size(int size) { - assert(1 <= size && size <= 8 
&& is_power_of_2(size), "Illegal size"); - _stats[exact_log2(size)]++; - } - - int count_size(int size) { - assert(1 <= size && size <= 8 && is_power_of_2(size), "Illegal size"); - return _stats[exact_log2(size)]; - } - - int smallest_size() { - for (int i = 0; i <= 3; i++) { - if (_stats[i] > 0) return (1 << i); - } - return NO_SIZE; - } - - int largest_size() { - for (int i = 3; i >= 0; i--) { - if (_stats[i] > 0) return (1 << i); - } - return NO_SIZE; - } - - int unique_size() { - int small = smallest_size(); - int large = largest_size(); - return (small == large) ? small : MIXED_SIZE; - } -}; - // When alignment is required, we must adjust the pre-loop iteration count pre_iter, // such that the address is aligned for any main_iter >= 0: // From 78857e82306f7b92d66c189bc3bbf60049836c1f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 16 Jan 2025 07:52:28 +0100 Subject: [PATCH 123/130] split init_is_valid --- src/hotspot/share/opto/vectorization.hpp | 56 ++++++++++++++---------- 1 file changed, 33 insertions(+), 23 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 30dc64a3735f9..0555624d23aa7 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -915,23 +915,31 @@ class VPointer : public ArenaObj { return 0; } - // Check that all variables are either the iv, or else invariants. + // Check the conditions for a "valid" VPointer. bool init_is_valid() const { - if (!_mem_pointer.base().is_known()) { - // VPointer needs to know if it is native (off-heap) or object (on-heap). - // We may for example have failed to fully decompose the MemPointer, possibly - // because such a decomposition is not considered safe. + return init_is_base_known() && + init_are_non_iv_summands_pre_loop_invariant() && + init_are_scale_and_stride_not_too_large(); + } + + // VPointer needs to know if it is native (off-heap) or object (on-heap). 
+ // We may for example have failed to fully decompose the MemPointer, possibly + // because such a decomposition is not considered safe. + bool init_is_base_known() const { + if (_mem_pointer.base().is_known()) { return true; } + #ifndef PRODUCT - if (_vloop.mptrace().is_trace_parsing()) { - tty->print_cr("VPointer::init_is_valid: base not known."); - } -#endif - return false; + if (_vloop.mptrace().is_trace_parsing()) { + tty->print_cr("VPointer::init_is_valid: base not known."); } +#endif + return false; + } - // All summands, except the iv-summand must be pre-loop invariant. This is necessary - // so that we can use the variables in checks inside or before the pre-loop, e.g. for - // alignment. + // All summands, except the iv-summand must be pre-loop invariant. This is necessary + // so that we can use the variables in checks inside or before the pre-loop, e.g. for + // alignment. + bool init_are_non_iv_summands_pre_loop_invariant() const { for (uint i = 0; i < MemPointer::SUMMANDS_SIZE; i++) { const MemPointerSummand& summand = _mem_pointer.summands_at(i); Node* variable = summand.variable(); @@ -946,16 +954,19 @@ class VPointer : public ArenaObj { return false; } } + return true; + } - // In the pointer analysis, and especially the AlignVector analysis, we assume that - // stride and scale are not too large. For example, we multiply "iv_scale * iv_stride", - // and assume that this does not overflow the int range. We also take "abs(iv_scale)" - // and "abs(iv_stride)", which would overflow for min_int = -(2^31). Still, we want - // to at least allow small and moderately large stride and scale. Therefore, we - // allow values up to 2^30, which is only a factor 2 smaller than the max/min int. - // Normal performance relevant code will have much lower values. And the restriction - // allows us to keep the rest of the autovectorization code much simpler, since we - // do not have to deal with overflows. 
+ // In the pointer analysis, and especially the AlignVector analysis, we assume that + // stride and scale are not too large. For example, we multiply "iv_scale * iv_stride", + // and assume that this does not overflow the int range. We also take "abs(iv_scale)" + // and "abs(iv_stride)", which would overflow for min_int = -(2^31). Still, we want + // to at least allow small and moderately large stride and scale. Therefore, we + // allow values up to 2^30, which is only a factor 2 smaller than the max/min int. + // Normal performance relevant code will have much lower values. And the restriction + // allows us to keep the rest of the autovectorization code much simpler, since we + // do not have to deal with overflows. + bool init_are_scale_and_stride_not_too_large() const { jlong long_iv_scale = _iv_scale; jlong long_iv_stride = _vloop.iv_stride(); jlong max_val = 1 << 30; @@ -969,7 +980,6 @@ class VPointer : public ArenaObj { #endif return false; } - return true; } }; From 91ff3cc9d51b40caf014b764d549161e1b92fb9f Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 16 Jan 2025 07:57:49 +0100 Subject: [PATCH 124/130] rename to has_same_invar_summands_and_iv_scale_as --- src/hotspot/share/opto/vectorization.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 0555624d23aa7..92b3561e7fbc1 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -867,7 +867,7 @@ class VPointer : public ArenaObj { // If we have the same invar_summands, and the same iv summand with the same iv_scale, // then all summands except the base must be the same. 
- bool has_same_invar_and_iv_scale_as(const VPointer& other) const { + bool has_same_invar_summands_and_iv_scale_as(const VPointer& other) const { return mem_pointer().has_same_non_base_summands_as(other.mem_pointer()); } @@ -1160,7 +1160,7 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { const VPointer& p2 = s2->vpointer(); bool both_no_invar = p1.count_invar_summands() == 0 && p2.count_invar_summands() == 0; - if(!both_no_invar && !p1.has_same_invar_and_iv_scale_as(p2)) { + if(!both_no_invar && !p1.has_same_invar_summands_and_iv_scale_as(p2)) { return new EmptyAlignmentSolution("invar alignment term not identical"); } From 51c18b72bf90ffb7ca770675914ec860f89a5d26 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 16 Jan 2025 09:43:17 +0100 Subject: [PATCH 125/130] refactor to has_invar_summands and print invar and fix non-product guards --- src/hotspot/share/opto/superword.cpp | 4 ++-- src/hotspot/share/opto/vectorization.hpp | 29 ++++++++++++++++++------ 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 56dbecd9e3c30..1fe310eac22a0 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2849,8 +2849,8 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { tty->print_cr(" con: %d", con); tty->print(" base:"); base->dump(); - if (p.count_invar_summands() == 0) { - tty->print_cr(" invar: null"); + if (!p.has_invar_summands()) { + tty->print_cr(" invar: none"); } else { tty->print_cr(" invar_summands:"); p.for_each_invar_summand([&] (const MemPointerSummand& s) { diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index 92b3561e7fbc1..a0a4ee98cd0f1 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -857,12 +857,12 @@ class VPointer : public ArenaObj { return invar_count > 0 ? 
factor : 0; } - int count_invar_summands() const { + bool has_invar_summands() const { int invar_count = 0; for_each_invar_summand([&] (const MemPointerSummand& s) { invar_count++; }); - return invar_count; + return invar_count > 0; } // If we have the same invar_summands, and the same iv summand with the same iv_scale, @@ -1026,7 +1026,7 @@ class AlignmentSolution : public ResourceObj { // Implemented by each subclass virtual const AlignmentSolution* filter(const AlignmentSolution* other) const = 0; - virtual void print() const = 0; + DEBUG_ONLY( virtual void print() const = 0; ) // Compute modulo and ensure that we get a positive remainder static int mod(int i, int q) { @@ -1060,9 +1060,11 @@ class EmptyAlignmentSolution : public AlignmentSolution { return new EmptyAlignmentSolution("empty solution input to filter"); } +#ifndef PRODUCT virtual void print() const override final { tty->print_cr("empty solution: %s", reason()); }; +#endif }; class TrivialAlignmentSolution : public AlignmentSolution { @@ -1083,9 +1085,11 @@ class TrivialAlignmentSolution : public AlignmentSolution { return other; } +#ifndef PRODUCT virtual void print() const override final { tty->print_cr("pre_iter >= 0 (trivial)"); }; +#endif }; class ConstrainedAlignmentSolution : public AlignmentSolution { @@ -1158,8 +1162,8 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { // Use VPointer to do checks on invar and iv_scale: const VPointer& p1 = s1->vpointer(); const VPointer& p2 = s2->vpointer(); - bool both_no_invar = p1.count_invar_summands() == 0 && - p2.count_invar_summands() == 0; + bool both_no_invar = !p1.has_invar_summands() && + !p2.has_invar_summands(); if(!both_no_invar && !p1.has_same_invar_summands_and_iv_scale_as(p2)) { return new EmptyAlignmentSolution("invar alignment term not identical"); } @@ -1200,13 +1204,24 @@ class ConstrainedAlignmentSolution : public AlignmentSolution { return s2; // return the subset } +#ifndef PRODUCT virtual void print() const override 
final { tty->print("m * q(%d) + r(%d)", _q, _r); - if (_vpointer.count_invar_summands() > 0) { - tty->print(" - invar / (iv_scale(%d) * pre_stride)", _vpointer.iv_scale()); + if (_vpointer.has_invar_summands()) { + tty->print(" - invar("); + int count = 0; + _vpointer.for_each_invar_summand([&] (const MemPointerSummand& s) { + if (count > 0) { + tty->print(" + "); + } + s.print_on(tty); + count++; + }); + tty->print(") / (iv_scale(%d) * pre_stride)", _vpointer.iv_scale()); } tty->print_cr(" [- init / pre_stride], mem_ref[%d]", mem_ref()->_idx); }; +#endif }; // When strict alignment is required (e.g. -XX:+AlignVector), then we must ensure From fcbef42fc36c8e973f6e2ee27149d0f8488fae47 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 16 Jan 2025 17:15:03 +0100 Subject: [PATCH 126/130] Batch 2 for Christian --- src/hotspot/share/opto/mempointer.hpp | 2 +- src/hotspot/share/opto/vectorization.hpp | 36 +++++++++++++----------- 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 7eb3d17ad7e63..32ce67c3ac390 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -570,7 +570,7 @@ class MemPointerParserCallback : public StackObj { // // Where SUM() adds all "scale_i * variable_i" for each i together. // -// Node: if the base is known, then it is in the 0th summand. A base can be: +// Note: if the base is known, then it is in the 0th summand. A base can be: // - on-heap / object: base().object() // - off-heap / native: base().native() // diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index a0a4ee98cd0f1..bdb81af6509f2 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -709,9 +709,10 @@ class VLoopAnalyzer : StackObj { // The VPointer wraps a MemPointer for the use in loops. 
A "valid" VPointer has // the form: // -// pointer = base + invar + iv_scale * iv + con -// +// pointer = base + invar + iv_summand + con +// with // invar = SUM(invar_summands) +// iv_summand = iv_scale * iv // // Where: // - base: is the known base of the MemPointer. @@ -720,22 +721,13 @@ class VLoopAnalyzer : StackObj { // If we find a summand where the variable is the iv, we set iv_scale to the // corresponding scale. If there is no such summand, then we know that the // pointer does not depend on the iv, since otherwise there would have to be -// a summand where its variable is main-loop variant. +// a summand where its variable is main-loop variant. Note: MemPointer already +// ensures that there is at most one summand per variable, so there is at +// most one summand with iv. // - invar_summands: all other summands except base and iv_summand. // All variables must be pre-loop invariant. This is important when we need // to memory align a pointer using the pre-loop limit. // -// A VPointer can be marked "invalid", if some of these conditions are not met, or -// it is unknown if they are met. If a VPointer is marked "invalid", it always -// returns conservative answers to aliasing queries, which means that we do not -// optimize in these cases. For example: -// - is_adjacent_to_and_before: returning true would allow optimizations such as -// packing into vectors. So for "invalid" VPointers -// we always return false (i.e. unknown). -// - never_overlaps_with: returning true would allow optimizations such as -// swapping the order of memops. So for "invalid" VPointers -// we always return false (i.e. unknown). -// // These are examples where a VPointer becomes "invalid": // - If the MemPointer does not have the required form for VPointer, // i.e. if one of these conditions is not met (see init_is_valid): @@ -745,6 +737,16 @@ class VLoopAnalyzer : StackObj { // alignment computations. // - If the new con computed in make_with_iv_offset overflows. 
// +// If a VPointer is marked "invalid", it always returns conservative answers to +// aliasing queries, which means that we do not optimize in these cases. +// For example: +// - is_adjacent_to_and_before: returning true would allow optimizations such as +// packing into vectors. So for "invalid" VPointers, +// we always return false (i.e. unknown). +// - never_overlaps_with: returning true would allow optimizations such as +// swapping the order of memops. So for "invalid" VPointers, +// we always return false (i.e. unknown). +// class VPointer : public ArenaObj { private: const VLoop& _vloop; @@ -923,8 +925,8 @@ class VPointer : public ArenaObj { } // VPointer needs to know if it is native (off-heap) or object (on-heap). - // We may for example have failed to fully decompose the MemPointer, possibly - // because such a decomposition is not considered safe. + // We may, for example, have failed to fully decompose the MemPointer, + // possibly because such a decomposition is not considered safe. bool init_is_base_known() const { if (_mem_pointer.base().is_known()) { return true; } @@ -936,7 +938,7 @@ class VPointer : public ArenaObj { return false; } - // All summands, except the iv-summand must be pre-loop invariant. This is necessary + // All summands, except the iv-summand, must be pre-loop invariant. This is necessary // so that we can use the variables in checks inside or before the pre-loop, e.g. for // alignment. 
bool init_are_non_iv_summands_pre_loop_invariant() const { From a8b79affdc67c7698380184d421d65a4e7d88919 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 18 Jan 2025 06:16:23 +0100 Subject: [PATCH 127/130] Apply suggestions from code review by Christian Co-authored-by: Christian Hagedorn --- src/hotspot/share/opto/mempointer.cpp | 11 ++++------- src/hotspot/share/opto/mempointer.hpp | 14 +++++++------- src/hotspot/share/opto/vectorization.cpp | 8 ++++---- src/hotspot/share/opto/vtransform.cpp | 2 +- 4 files changed, 16 insertions(+), 19 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index 65c1a318025bc..a1455ace62bff 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -224,7 +224,7 @@ bool MemPointerParser::sub_expression_has_native_base_candidate(Node* start) { worklist.append(start); for (int i = 0; i < worklist.length(); i++) { Node* n = worklist.at(i); - switch(n->Opcode()) { + switch (n->Opcode()) { case Op_AddL: // Traverse to both inputs. worklist.append(n->in(1)); @@ -270,11 +270,8 @@ bool MemPointerParser::is_native_memory_base_candidate(Node* n) { Symbol* field_symbol = field->name()->get_symbol(); Symbol* holder_symbol = field->holder()->name()->get_symbol(); - if (holder_symbol != vmSymbols::jdk_internal_foreign_NativeMemorySegmentImpl() || - field_symbol != vmSymbols::min_name()) { - return false; - } - return true; + return holder_symbol == vmSymbols::jdk_internal_foreign_NativeMemorySegmentImpl() && + field_symbol == vmSymbols::min_name(); } // Check if the decomposition of operation opc is guaranteed to be safe. 
@@ -604,7 +601,7 @@ bool MemPointer::never_overlaps_with(const MemPointer& other) const { // this >= other + other.size || this + this.size <= other // // Which we can restate as: - // distance <= -other.size || this.size <= distance + // distance <= -other.size || this.size <= distance // const jint distance_lo = -other.size(); const jint distance_hi = size(); diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 32ce67c3ac390..419d9a896491b 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -231,7 +231,7 @@ // or outside it. We also have no guarantee for alignment with such a base address. // Still: we would like to find such a base if possible, and if two pointers are similar (i.e. have the // same summands), we would like to find the same base. Further, it is reasonable to speculatively -// assume that such base addresses are aligned (need to add this speculative check in JDK-8323582). +// assume that such base addresses are aligned (TODO: need to add this speculative check in JDK-8323582). // A base pointer must have scale = 1, and be accepted byMemPointer::is_native_memory_base_candidate. // It can thus be one of these: // (1) CastX2P @@ -240,7 +240,7 @@ // decomposed the CastX2P, but at that point it is even harder to tell what should be a good // candidate for a native memory base. // (2) LoadL from field jdk.internal.foreign.NativeMemorySegmentImpl.min -// This would be preferrable over CastX2P, because it holds the address() of a native +// This would be preferable over CastX2P, because it holds the address() of a native // MemorySegment, i.e. we know it points to the beginning of that MemorySegment. 
// // ----------------------------------------------------------------------------------------- @@ -607,9 +607,9 @@ class MemPointer : public StackObj { bool is_known() const { return _kind != Unknown; } bool is_object() const { return _kind == Object; } bool is_native() const { return _kind == Native; } - Node* object() const { assert(is_object(), ""); return _base; } - Node* native() const { assert(is_native(), ""); return _base; } - Node* object_or_native() const { assert(is_known(), ""); return _base; } + Node* object() const { assert(is_object(), "unexpected kind"); return _base; } + Node* native() const { assert(is_native(), "unexpected kind"); return _base; } + Node* object_or_native() const { assert(is_known(), "unexpected kind"); return _base; } Node* object_or_native_or_null() const { return _base; } #ifndef PRODUCT @@ -675,7 +675,7 @@ class MemPointer : public StackObj { } #endif - // Put the base in in the 0th summand. + // Put the base in the 0th summand. Node* base = _base.object_or_native_or_null(); int pos = 0; if (base != nullptr) { @@ -760,7 +760,7 @@ class MemPointer : public StackObj { public: bool has_same_non_base_summands_as(const MemPointer& other) const { if (!base().is_known() || !other.base().is_known()) { - assert(false, "unknonw base case is not answered optimally"); + assert(false, "unknown base case is not answered optimally"); return false; } // Known base at 0th summand: all other summands are non-base summands. diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 4e00a5f6ba78d..987eb00fc7ccc 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -480,8 +480,8 @@ AlignmentSolution* AlignmentSolver::solve() const { // // base % ObjectAlignmentInBytes = 0 ==> base % aw = 0 // - // Note: we have been assuming that this also holds for native memory base - // addresses. This is incorrect, see JDK-8323582. 
+ // TODO: Note: we have been assuming that this also holds for native memory base + // addresses. This is incorrect, see JDK-8323582. // // 2) The "C_const" term is the sum of all constant terms. This is "con", // plus "iv_scale * init" if it is constant. @@ -522,7 +522,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // We must find a pre_iter, such that adr is aw aligned: adr % aw = 0. Note, that we are defining the // modulo operator "%" such that the remainder is always positive, see AlignmentSolution::mod(i, q). // - // Note: the following assumption is incorrect for native memory bases, see JDK-8323582. + // TODO: Note: the following assumption is incorrect for native memory bases, see JDK-8323582. // Since "base % aw = 0", we only need to ensure alignment of the other 5 terms: // // (C_const + C_invar * var_invar + C_init * var_init + C_pre * pre_iter + C_main * main_iter) % aw = 0 (1) @@ -880,7 +880,7 @@ AlignmentSolution* AlignmentSolver::solve() const { // + iv_scale * main_stride * main_iter)) % aw = // // -> base aligned: base % aw = 0 - // Note: this assumption is incorrect for native memory bases, see JDK-8323582. + // TODO: Note: this assumption is incorrect for native memory bases, see JDK-8323582. // -> main-loop iterations aligned (2): C_main % aw = (iv_scale * main_stride) % aw = 0 // (con + invar + iv_scale * init + iv_scale * pre_stride * pre_iter) % aw = // diff --git a/src/hotspot/share/opto/vtransform.cpp b/src/hotspot/share/opto/vtransform.cpp index 8ca5839775bae..4c63912f177ab 100644 --- a/src/hotspot/share/opto/vtransform.cpp +++ b/src/hotspot/share/opto/vtransform.cpp @@ -145,7 +145,7 @@ void VTransformApplyResult::trace(VTransformNode* vtnode) const { #endif // Helper-class for VTransformGraph::has_store_to_load_forwarding_failure. -// It wraps a VPointer. The VPointer have an iv_offset applied, which +// It wraps a VPointer. The VPointer has an iv_offset applied, which // simulates a virtual unrolling. 
They represent the memory region: // [adr, adr + size) // adr = base + invar + iv_scale * (iv + iv_offset) + con From 8f093203766b983cfb82f6403e4309dd9f458dcb Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 18 Jan 2025 06:55:53 +0100 Subject: [PATCH 128/130] for Christian --- src/hotspot/share/opto/mempointer.hpp | 9 +++++++-- src/hotspot/share/opto/superword.cpp | 7 +++++-- src/hotspot/share/opto/superword.hpp | 2 +- src/hotspot/share/opto/vectorization.cpp | 6 +++--- .../loopopts/superword/TestEquivalentInvariants.java | 4 ++-- .../compiler/loopopts/superword/TestMemorySegment.java | 8 ++++---- 6 files changed, 22 insertions(+), 14 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.hpp b/src/hotspot/share/opto/mempointer.hpp index 419d9a896491b..f1d29f2453f52 100644 --- a/src/hotspot/share/opto/mempointer.hpp +++ b/src/hotspot/share/opto/mempointer.hpp @@ -507,10 +507,10 @@ class MemPointerSummand : public StackObj { static int cmp_by_variable_idx(const MemPointerSummand& p1, const MemPointerSummand& p2) { if (p1.variable() == nullptr) { return (p2.variable() == nullptr) ? 0 : 1; - } else if (p2.variable() == nullptr) { + } + if (p2.variable() == nullptr) { return -1; } - return p1.variable()->_idx - p2.variable()->_idx; } @@ -826,6 +826,11 @@ class MemPointer : public StackObj { #endif }; +// Utility class. +// MemPointerParser::parse takes a MemNode (load or store) and computes its MemPointer. +// It temporarily allocates dynamic data structures (GrowableArray) in the resource +// area. This way, the computed MemPointer does not have to have any dynamic data +// structures and can be copied freely by value. 
class MemPointerParser : public StackObj { private: const MemNode* _mem; diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 1fe310eac22a0..cbae601788fde 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -95,6 +95,9 @@ class SuperWordUnrollingAnalysisIgnoredNodes : public MemPointerParserCallback { } }; +// SuperWord unrolling analysis does: +// - Determine if the loop is a candidate for auto vectorization (SuperWord). +// - Find a good unrolling factor, to ensure full vector width utilization once we vectorize. void SuperWord::unrolling_analysis(const VLoop &vloop, int &local_loop_unroll_factor) { IdealLoopTree* lpt = vloop.lpt(); CountedLoopNode* cl = vloop.cl(); @@ -556,7 +559,7 @@ void SuperWord::create_adjacent_memop_pairs() { } // Collect all memops that could potentially be vectorized. -void SuperWord::collect_valid_memops(GrowableArray& memops) { +void SuperWord::collect_valid_memops(GrowableArray& memops) const { int original_index = 0; for_each_mem([&] (MemNode* mem, int bb_idx) { const VPointer& p = vpointer(mem); @@ -2834,7 +2837,6 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { const int iv_scale = p.iv_scale(); const int con = p.con(); Node* base = p.mem_pointer().base().object_or_native(); - bool is_base_native = p.mem_pointer().base().is_native(); #ifdef ASSERT if (_trace._align_vector) { @@ -2927,6 +2929,7 @@ void VTransform::adjust_pre_loop_limit_to_align_main_loop_vectors() { }); // 1.3: base (unless base is guaranteed aw aligned) + bool is_base_native = p.mem_pointer().base().is_native(); if (aw > ObjectAlignmentInBytes || is_base_native) { // For objects, the base is ObjectAlignmentInBytes aligned. 
// For native memory, we simply have a long that was cast to diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index e66b57358955b..57a403b449843 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -594,7 +594,7 @@ class SuperWord : public ResourceObj { } }; void create_adjacent_memop_pairs(); - void collect_valid_memops(GrowableArray& memops); + void collect_valid_memops(GrowableArray& memops) const; void create_adjacent_memop_pairs_in_all_groups(const GrowableArray& memops); static int find_group_end(const GrowableArray& memops, int group_start); void create_adjacent_memop_pairs_in_one_group(const GrowableArray& memops, const int group_start, int group_end); diff --git a/src/hotspot/share/opto/vectorization.cpp b/src/hotspot/share/opto/vectorization.cpp index 987eb00fc7ccc..12be2f5cd092e 100644 --- a/src/hotspot/share/opto/vectorization.cpp +++ b/src/hotspot/share/opto/vectorization.cpp @@ -182,8 +182,8 @@ void VLoopVPointers::count_vpointers() { } void VLoopVPointers::allocate_vpointers_array() { - uint bytes2 = _vpointers_length * sizeof(VPointer); - _vpointers = (VPointer*)_arena->Amalloc(bytes2); + uint bytes = _vpointers_length * sizeof(VPointer); + _vpointers = (VPointer*)_arena->Amalloc(bytes); } void VLoopVPointers::compute_and_cache_vpointers() { @@ -959,7 +959,7 @@ void AlignmentSolver::trace_start_solve() const { _pre_stride, _main_stride); // adr = base + con + invar + iv_scale * iv tty->print(" adr = base[%d]", base().object_or_native()->_idx); - tty->print(" + con(%d) + invar + iv_scale(%d) * iv", _vpointer.con(), iv_scale()); + tty->print(" + invar + iv_scale(%d) * iv + con(%d)", iv_scale(), _vpointer.con()); } } diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java index 0d3dbd237078f..a9b158f1c0980 100644 --- 
a/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestEquivalentInvariants.java @@ -34,7 +34,7 @@ /* * @test - * @bug 8343685 8330274 + * @bug 8343685 8331659 * @summary Test vectorization with various invariants that are equivalent, but not trivially so, * i.e. where the invariants have the same summands, but in a different order. * @modules java.base/jdk.internal.misc @@ -884,7 +884,7 @@ static Object[] testMemorySegmentLInvarL3d3(MemorySegment m, int size) { IRNode.STORE_VECTOR, "= 0"}, applyIfPlatform = {"64-bit", "true"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) - // FAILS: should be ok to vectorize, but does not. Investigate in JDK-8330274. + // FAILS: should be ok to vectorize, but does not. Investigate in JDK-8331659. static Object[] testMemorySegmentLInvarL3e(MemorySegment m, int invar1, int invar2, int invar3, int size) { long i1 = (long)(-invar1 + invar2 + invar3); long i2 = (long)(invar2 + invar3) - (long)(invar1); // not equivalent diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java index 38533f9d07217..aab5bcfb28b39 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestMemorySegment.java @@ -646,7 +646,7 @@ static Object[] testLongLoop_longIndex_longInvar_sameAdr_byte(MemorySegment a, l @Test // FAILS: invariants are sorted differently, because of differently inserted Cast. - // See: JDK-8330274 + // See: JDK-8331659 // Interestingly, it now vectorizes for native, but not for arrays. 
static Object[] testLongLoop_longIndex_intInvar_byte(MemorySegment a, int invar) { for (long i = 0; i < a.byteSize(); i++) { @@ -660,7 +660,7 @@ static Object[] testLongLoop_longIndex_intInvar_byte(MemorySegment a, int invar) @Test // FAILS: invariants are sorted differently, because of differently inserted Cast. - // See: JDK-8330274 + // See: JDK-8331659 // Interestingly, it now vectorizes for native, but not for arrays. static Object[] testLongLoop_longIndex_longInvar_byte(MemorySegment a, long invar) { for (long i = 0; i < a.byteSize(); i++) { @@ -747,7 +747,7 @@ static Object[] testLongLoop_longIndex_longInvar_sameAdr_int(MemorySegment a, lo applyIfPlatform = {"64-bit", "true"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // FAILS: invariants are sorted differently, because of differently inserted Cast. - // See: JDK-8330274 + // See: JDK-8331659 static Object[] testLongLoop_longIndex_intInvar_int(MemorySegment a, int invar) { for (long i = 0; i < a.byteSize()/4; i++) { long adr1 = 4L * (long)(i) + 4L * (long)(invar); @@ -765,7 +765,7 @@ static Object[] testLongLoop_longIndex_intInvar_int(MemorySegment a, int invar) applyIfPlatform = {"64-bit", "true"}, applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) // FAILS: invariants are sorted differently, because of differently inserted Cast. 
- // See: JDK-8330274 + // See: JDK-8331659 static Object[] testLongLoop_longIndex_longInvar_int(MemorySegment a, long invar) { for (long i = 0; i < a.byteSize()/4; i++) { long adr1 = 4L * (long)(i) + 4L * (long)(invar); From b5e4501afd4bd67ef8fef431148f6beaaa399877 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 18 Jan 2025 06:59:02 +0100 Subject: [PATCH 129/130] fix indent --- src/hotspot/share/opto/mempointer.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/hotspot/share/opto/mempointer.cpp b/src/hotspot/share/opto/mempointer.cpp index a1455ace62bff..174d8ee02b387 100644 --- a/src/hotspot/share/opto/mempointer.cpp +++ b/src/hotspot/share/opto/mempointer.cpp @@ -489,9 +489,9 @@ MemPointerAliasing MemPointer::get_aliasing_with(const MemPointer& other // we computed for the MemPointers: // p_other - p_this = distance = other.con - this.con #ifndef PRODUCT - if (trace.is_trace_aliasing()) { - tty->print_cr(" -> Aliasing always at distance = %d.", distance.value()); - } + if (trace.is_trace_aliasing()) { + tty->print_cr(" -> Aliasing always at distance = %d.", distance.value()); + } #endif return MemPointerAliasing::make_always_at_distance(distance.value()); } else { @@ -502,9 +502,9 @@ MemPointerAliasing MemPointer::get_aliasing_with(const MemPointer& other // same memory object, i.e. (S1) holds. We have already proven (S0) // and (S3), so all 4 conditions for "MemPointer Lemma" are given. 
#ifndef PRODUCT - if (trace.is_trace_aliasing()) { - tty->print_cr(" -> Aliasing not or at distance = %d.", distance.value()); - } + if (trace.is_trace_aliasing()) { + tty->print_cr(" -> Aliasing not or at distance = %d.", distance.value()); + } #endif return MemPointerAliasing::make_not_or_at_distance(distance.value()); } From 714298b96e241b5e7e1123c6e64275670fd2a9a8 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Sat, 18 Jan 2025 17:20:09 +0100 Subject: [PATCH 130/130] fix guard --- src/hotspot/share/opto/vectorization.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectorization.hpp b/src/hotspot/share/opto/vectorization.hpp index bdb81af6509f2..cb1e8c4585675 100644 --- a/src/hotspot/share/opto/vectorization.hpp +++ b/src/hotspot/share/opto/vectorization.hpp @@ -1028,7 +1028,7 @@ class AlignmentSolution : public ResourceObj { // Implemented by each subclass virtual const AlignmentSolution* filter(const AlignmentSolution* other) const = 0; - DEBUG_ONLY( virtual void print() const = 0; ) + NOT_PRODUCT( virtual void print() const = 0; ) // Compute modulo and ensure that we get a positive remainder static int mod(int i, int q) {