Skip to content

Commit da14813

Browse files
committed
8373453: C2 SuperWord: must handle load slices that have loads with different memory inputs
Reviewed-by: kvn, thartmann, qamai
1 parent 929864b commit da14813

File tree

3 files changed

+116
-7
lines changed

3 files changed

+116
-7
lines changed

src/hotspot/share/opto/vectorization.cpp

Lines changed: 18 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,10 @@ VStatus VLoopAnalyzer::setup_submodules_helper() {
187187
return body_status;
188188
}
189189

190-
_memory_slices.find_memory_slices();
190+
VStatus slices_status = _memory_slices.find_memory_slices();
191+
if (!slices_status.is_success()) {
192+
return slices_status;
193+
}
191194

192195
// If there is no memory slice detected, it means there is no store.
193196
// If there is no reduction and no store, then we give up, because
@@ -207,9 +210,11 @@ VStatus VLoopAnalyzer::setup_submodules_helper() {
207210
}
208211

209212
// There are 2 kinds of slices:
210-
// - No memory phi: only loads. All have the same input memory state from before the loop.
213+
// - No memory phi: only loads.
214+
// - Usually, all loads have the same input memory state from before the loop.
215+
// - Only rarely this is not the case, and we just bail out for now.
211216
// - With memory phi. Chain of memory operations inside the loop.
212-
void VLoopMemorySlices::find_memory_slices() {
217+
VStatus VLoopMemorySlices::find_memory_slices() {
213218
Compile* C = _vloop.phase()->C;
214219
// We iterate over the body, which is topologically sorted. Hence, if there is a phi
215220
// in a slice, we will find it first, and the loads and stores afterwards.
@@ -228,8 +233,15 @@ void VLoopMemorySlices::find_memory_slices() {
228233
PhiNode* head = _heads.at(alias_idx);
229234
if (head == nullptr) {
230235
// We did not find a phi on this slice yet -> must be a slice with only loads.
231-
assert(_inputs.at(alias_idx) == nullptr || _inputs.at(alias_idx) == load->in(1),
232-
"not yet touched or the same input");
236+
// For now, we can only handle slices with a single memory input before the loop,
237+
// so if we find multiple, we bail out of auto vectorization. If this becomes
238+
// too restrictive in the fututure, we could consider tracking multiple inputs.
239+
// Different memory inputs can for example happen if one load has its memory state
240+
// optimized, and the other load fails to have it optimized, for example because
241+
// it does not end up on the IGVN worklist any more.
242+
if (_inputs.at(alias_idx) != nullptr && _inputs.at(alias_idx) != load->in(1)) {
243+
return VStatus::make_failure(FAILURE_DIFFERENT_MEMORY_INPUT);
244+
}
233245
_inputs.at_put(alias_idx, load->in(1));
234246
} // else: the load belongs to a slice with a phi that already set heads and inputs.
235247
#ifdef ASSERT
@@ -243,6 +255,7 @@ void VLoopMemorySlices::find_memory_slices() {
243255
}
244256
}
245257
NOT_PRODUCT( if (_vloop.is_trace_memory_slices()) { print(); } )
258+
return VStatus::make_success();
246259
}
247260

248261
#ifndef PRODUCT

src/hotspot/share/opto/vectorization.hpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023, 2025, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2023, 2026, Oracle and/or its affiliates. All rights reserved.
33
* Copyright (c) 2023, Arm Limited. All rights reserved.
44
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
55
*
@@ -504,6 +504,8 @@ class VLoopBody : public StackObj {
504504
//
505505
class VLoopMemorySlices : public StackObj {
506506
private:
507+
static constexpr char const* FAILURE_DIFFERENT_MEMORY_INPUT = "Load only slice has multiple memory inputs";
508+
507509
const VLoop& _vloop;
508510
const VLoopBody& _body;
509511

@@ -521,7 +523,7 @@ class VLoopMemorySlices : public StackObj {
521523
const GrowableArray<Node*>& inputs() const { return _inputs; }
522524
const GrowableArray<PhiNode*>& heads() const { return _heads; }
523525

524-
void find_memory_slices();
526+
VStatus find_memory_slices();
525527
void get_slice_in_reverse_order(PhiNode* head, MemNode* tail, GrowableArray<MemNode*>& slice) const;
526528
bool same_memory_slice(MemNode* m1, MemNode* m2) const;
527529

Lines changed: 94 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,94 @@
1+
/*
2+
* Copyright (c) 2026, Oracle and/or its affiliates. All rights reserved.
3+
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4+
*
5+
* This code is free software; you can redistribute it and/or modify it
6+
* under the terms of the GNU General Public License version 2 only, as
7+
* published by the Free Software Foundation.
8+
*
9+
* This code is distributed in the hope that it will be useful, but WITHOUT
10+
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11+
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12+
* version 2 for more details (a copy is included in the LICENSE file that
13+
* accompanied this code).
14+
*
15+
* You should have received a copy of the GNU General Public License version
16+
* 2 along with this work; if not, write to the Free Software Foundation,
17+
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18+
*
19+
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20+
* or visit www.oracle.com if you need additional information or have any
21+
* questions.
22+
*/
23+
24+
/*
25+
* @test id=all-flags
26+
* @summary Test a case where we can have one memory slice that has only loads,
27+
* but the loads from the slice do not have all the same input memory
28+
* state from before the loop. This is rather rare but it can happen.
29+
* @bug 8373453
30+
* @run main/othervm
31+
* -XX:CompileCommand=compileonly,${test.main.class}::test
32+
* -Xbatch -XX:-TieredCompilation
33+
* ${test.main.class}
34+
*/
35+
36+
/*
37+
* @test id=fewer-flags
38+
* @bug 8373453
39+
* @run main/othervm
40+
* -XX:CompileCommand=compileonly,${test.main.class}::test
41+
* ${test.main.class}
42+
*/
43+
44+
/*
45+
* @test id=vanilla
46+
* @bug 8373453
47+
* @run main ${test.main.class}
48+
*/
49+
50+
package compiler.loopopts.superword;
51+
52+
public class TestLoadSliceWithMultipleMemoryInputStates {
53+
static void test() {
54+
// The relevant slice is the value field of the Byte Objects.
55+
Byte x = 1;
56+
57+
for (int i = 0; i < 2; i++) {
58+
if ((i & 1) == 0) {
59+
// Not sure what this loop is needed for, but it is very sensitive,
60+
// I cannot even replace N with 32.
61+
int N = 32;
62+
for (int j = 0; j < N; j++) {
63+
if (j == 1) {
64+
x = (byte) x;
65+
}
66+
}
67+
68+
for (int j = 0; j < 32; j++) {
69+
// The call below has an effect on the memory state
70+
// If we optimize the Load for Byte::value, we can bypass
71+
// this call, since we know that Byte::value cannot be
72+
// modified during the call.
73+
Object o = 1;
74+
o.toString();
75+
76+
for (int k = 0; k < 32; k++) { // OSR around here
77+
// Loads of x byte field have different memory input states
78+
// This is because some loads can split their memory state
79+
// through a phi further up, and others are not put back on
80+
// the IGVN worklist and are thus not optimized and keep
81+
// the old memory state. Both are correct though.
82+
x = (byte) (x + 1);
83+
}
84+
}
85+
}
86+
}
87+
}
88+
89+
public static void main(String[] args) {
90+
for (int i = 0; i < 10_000; i++) {
91+
test();
92+
}
93+
}
94+
}

0 commit comments

Comments
 (0)