Skip to content
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
3dd72b8
8307513: C2: intrinsify Math.max(long,long) and Math.min(long,long)
galderz Jul 8, 2024
e43b390
Add IR test
galderz Jul 18, 2024
f910739
Refactor inline methods to unify their implementations
galderz Jul 19, 2024
ce71a0e
Add math vectorized JMH benchmark
galderz Jul 23, 2024
8d66f7b
Rename benchmark class to MathLoopBench
galderz Aug 27, 2024
605a78a
Fix multi long tests to use long arrays
galderz Aug 27, 2024
1522e26
Implement cmovL as a jump+mov branch
galderz Sep 9, 2024
a64fcda
Switch movl to movq
galderz Sep 11, 2024
13ed872
Fix format of assembly for the movl to movq switch
galderz Sep 11, 2024
da720c5
Distribute values targetting a branch percentage
galderz Sep 12, 2024
0b71cb5
Fix min case to distribute numbers as per probability
galderz Sep 12, 2024
fe3aff4
Fix compilation error
galderz Sep 12, 2024
0047a4b
Add an intermediate % that is more representative of real life
galderz Sep 12, 2024
f622852
Skip single array benchmarks
galderz Sep 16, 2024
6fd8805
Add min/max benchmark that includes loops and reductions
galderz Sep 24, 2024
93799d5
Renamed benchmark methods
galderz Sep 24, 2024
c06e869
Multiply array value in reduction for vectorization to kick in
galderz Sep 25, 2024
28778c8
Remove previous benchmark effort
galderz Sep 27, 2024
bc648aa
Revert "Fix format of assembly for the movl to movq switch"
galderz Sep 27, 2024
7a07aa8
Revert "Switch movl to movq"
galderz Sep 27, 2024
16ae2a3
Revert "Implement cmovL as a jump+mov branch"
galderz Sep 27, 2024
3f712e2
Merge branch 'master' into topic.intrinsify-max-min-long
galderz Oct 17, 2024
6cc5484
Avoid creating result array in benchmark method
galderz Oct 9, 2024
c956012
Encapsulate benchmark state within an inner class
galderz Oct 10, 2024
0b19789
Add clipping range benchmark that uses min/max
galderz Oct 10, 2024
e669893
Restore previous benchmark iterations and default param size
galderz Oct 10, 2024
dcf6b54
Make state class non-final
galderz Oct 10, 2024
b19fc81
Double/Float tests only when avx enabled
galderz Oct 15, 2024
f6f0244
Renamed benchmark class
galderz Oct 17, 2024
0a8718e
Use same default size as in other vector reduction benchmarks
galderz Oct 17, 2024
aca0922
Merge branch 'master' into topic.intrinsify-max-min-long
galderz Dec 12, 2024
65e2e48
Add empty line
galderz Dec 17, 2024
c964c26
Add max reduction test
galderz Dec 17, 2024
cfe0239
Fix style
galderz Dec 17, 2024
7353a07
Adjust min/max identity IR test expectations after changes
galderz Dec 17, 2024
130b475
Added comment around the assertions
galderz Dec 17, 2024
4d4753f
Tests should also run on aarch64 asimd=true envs
galderz Dec 18, 2024
fb0f731
Fix license header
galderz Dec 18, 2024
c049198
Test can only run with 256 bit registers or bigger
galderz Jan 9, 2025
abbaf87
Make sure it runs with cpus with either avx512 or asimd
galderz Jan 13, 2025
94397d3
Fix copyright years
galderz Jan 17, 2025
f83d886
Renaming methods and variables and add docu on algorithms
galderz Jan 17, 2025
724a346
Fix typo
galderz Jan 17, 2025
a190ae6
Merge branch 'master' into topic.intrinsify-max-min-long
galderz Feb 7, 2025
d0e793a
Add simple reduction benchmarks on top of multiply ones
galderz Feb 17, 2025
38537fc
Add assertion comments
galderz Mar 7, 2025
1aa690d
Merge branch 'master' into topic.intrinsify-max-min-long
galderz Mar 7, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions src/hotspot/share/opto/library_call.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1943,9 +1943,9 @@ bool LibraryCallKit::inline_notify(vmIntrinsics::ID id) {

//----------------------------inline_min_max-----------------------------------
bool LibraryCallKit::inline_min_max(vmIntrinsics::ID id) {
Node *a = nullptr;
Node *b = nullptr;
Node *n = nullptr;
Node* a = nullptr;
Node* b = nullptr;
Node* n = nullptr;
switch (id) {
case vmIntrinsics::_min:
case vmIntrinsics::_max:
Expand Down
10 changes: 6 additions & 4 deletions test/hotspot/jtreg/compiler/c2/irTests/TestMinMaxIdentities.java
Original file line number Diff line number Diff line change
Expand Up @@ -112,9 +112,11 @@ public int intMaxMax(int a, int b) {

// Longs

// As Math.min/max(LL) is not intrinsified, it first needs to be transformed into CMoveL and then MinL/MaxL before
// As Math.min/max(LL) is not intrinsified in the backend, it first needs to be transformed into CMoveL and then MinL/MaxL before
// the identity can be matched. However, the outer min/max is not transformed into CMove because of the CMove cost model.
// As JDK-8307513 adds intrinsics for the methods, the tests will be updated then.
// JDK-8307513 adds intrinsics for the methods such that MinL/MaxL replace the ternary operations,
// and this enables identities to be matched.
// Note that before JDK-8307513 MinL/MaxL nodes were already present before macro expansion.

@Test
@IR(applyIfPlatform = { "riscv64", "false" }, phase = { CompilePhase.BEFORE_MACRO_EXPANSION }, counts = { IRNode.MIN_L, "1" })
Expand All @@ -123,13 +125,13 @@ public long longMinMin(long a, long b) {
}

@Test
@IR(applyIfPlatform = { "riscv64", "false" }, phase = { CompilePhase.BEFORE_MACRO_EXPANSION }, counts = { IRNode.MIN_L, "1" })
@IR(failOn = { IRNode.MIN_L, IRNode.MAX_L })
public long longMinMax(long a, long b) {
return Math.min(a, Math.max(a, b));
}

@Test
@IR(applyIfPlatform = { "riscv64", "false" }, phase = { CompilePhase.BEFORE_MACRO_EXPANSION }, counts = { IRNode.MAX_L, "1" })
@IR(failOn = { IRNode.MIN_L, IRNode.MAX_L })
public long longMaxMin(long a, long b) {
return Math.max(a, Math.min(a, b));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,6 +73,10 @@ public static void checkTestIntMax(int result) {
}
}

// JDK-8307513 does not change the way MinL/MaxL nodes are intrinsified in the backend.
// So they are still transformed into CmpL + CMoveL nodes after macro expansion.
// This is the reason for the different before/after macro expansion assertions below.

@Test
@Arguments(values = { Argument.NUMBER_MINUS_42, Argument.NUMBER_42 })
@IR(phase = { CompilePhase.BEFORE_MACRO_EXPANSION }, counts = { IRNode.MIN_L, "1" })
Expand Down
155 changes: 155 additions & 0 deletions test/hotspot/jtreg/compiler/loopopts/superword/MinMaxRed_Long.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,155 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2024, Oracle and/or its affiliates. All rights reserved.

* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

/**
* @test
* @bug 8307513
* @summary [SuperWord] MaxReduction and MinReduction should vectorize for long
* @library /test/lib /
* @run driver compiler.loopopts.superword.MinMaxRed_Long
*/

package compiler.loopopts.superword;

import compiler.lib.ir_framework.*;
import jdk.test.lib.Utils;

import java.util.Arrays;
import java.util.Random;
import java.util.stream.LongStream;

public class MinMaxRed_Long {

private static final Random random = Utils.getRandomInstance();

/**
 * Entry point used by the jtreg {@code @run driver} line: builds the IR test
 * framework, hands it the extra VM flags and starts it.
 *
 * @param args ignored
 * @throws Exception propagated from the framework
 */
public static void main(String[] args) throws Exception {
    // Flags forwarded to the framework for the test run.
    final String ignoreUnrecognized = "-XX:+IgnoreUnrecognizedVMOptions";
    final String unrollLimit = "-XX:LoopUnrollLimit=250";
    final String thresholdScaling = "-XX:CompileThresholdScaling=0.1";

    TestFramework irTests = new TestFramework();
    irTests.addFlags(ignoreUnrecognized, unrollLimit, thresholdScaling);
    irTests.start();
}

/**
 * Standalone runner for {@code maxReductionImplement}: drives the max
 * reduction check once for each target probability (50, 80 and 100).
 */
@Run(test = {"maxReductionImplement"},
     mode = RunMode.STANDALONE)
public void runMaxTest() {
    final int[] probabilities = {50, 80, 100};
    for (int probability : probabilities) {
        runMaxTest(probability);
    }
}

/**
 * Fills a 1024-element array via {@code ReductionInit}, feeds it through
 * {@code maxReductionImplement} 2000 times (threading the running result
 * back in each time) and checks the outcome against 11 times the array's
 * maximum computed with streams.
 *
 * @param probability value forwarded to {@code ReductionInit}
 * @throws AssertionError if the reduced value differs from the expected one
 */
private static void runMaxTest(int probability) {
    final long[] input = new long[1024];
    ReductionInit(input, probability);

    long reduced = 0;
    for (int round = 0; round < 2000; round++) {
        reduced = maxReductionImplement(input, reduced);
    }

    // The kernel scales every element by 11 before reducing.
    final long expected = 11 * Arrays.stream(input).max().getAsLong();
    if (reduced != expected) {
        throw new AssertionError("Failed");
    }
    System.out.println("Success");
}

/**
 * Standalone runner for {@code minReductionImplement}: drives the min
 * reduction check once for each target probability (50, 80 and 100).
 */
@Run(test = {"minReductionImplement"},
     mode = RunMode.STANDALONE)
public void runMinTest() {
    final int[] probabilities = {50, 80, 100};
    for (int probability : probabilities) {
        runMinTest(probability);
    }
}

/**
 * Min counterpart of the max check: generates 1024 values with
 * {@code ReductionInit}, negates them, runs {@code minReductionImplement}
 * 2000 times (threading the running result back in) and compares the
 * outcome with 11 times the negated array's minimum computed with streams.
 *
 * @param probability value forwarded to {@code ReductionInit}
 * @throws AssertionError if the reduced value differs from the expected one
 */
private static void runMinTest(int probability) {
    final long[] generated = new long[1024];
    ReductionInit(generated, probability);

    // The generator shapes its data around a running max; flipping the sign
    // of every value lets the same data drive the min reduction.
    final long[] input = negate(generated);

    long reduced = 0;
    for (int round = 0; round < 2000; round++) {
        reduced = minReductionImplement(input, reduced);
    }

    // The kernel scales every element by 11 before reducing.
    final long expected = 11 * Arrays.stream(input).min().getAsLong();
    if (reduced != expected) {
        throw new AssertionError("Failed");
    }
    System.out.println("Success");
}

/**
 * Returns a new array holding the arithmetic negation of every element of
 * {@code nums}, in the same order. The input array is left untouched.
 */
static long[] negate(long[] nums) {
    final long[] negated = new long[nums.length];
    for (int i = 0; i < negated.length; i++) {
        negated[i] = -nums[i];
    }
    return negated;
}

public static void ReductionInit(long[] longs, int probability) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
public static void ReductionInit(long[] longs, int probability) {
public static void reductionInit(long[] longs, int probability) {

This is a method name, not a class - so I think it should start lower-case, right?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

And the method might as well allocate the array too. But up to you.

int aboveCount, abovePercent;

// Iterate until you find a set that matches the requirement probability
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you give a high-level definition / explanation what this does?
Also: what is the expected number of rounds you iterate here? I'm asking because I would like to be sure that a timeout is basically impossible because the probability is too low.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure I'll add. It's an approximation to make it run fast as sizes increase. In the worst case I've seen it take 15 rounds when size was 100, 50% probability and got 50 below max and 50 above. But with bigger array sizes, say 10'000, and 50% probability aim, it can take 1 or 2 rounds ending up with 5027 above max, 4973 below max.

do {
long max = random.nextLong(10);
longs[0] = max;

aboveCount = 0;
for (int i = 1; i < longs.length; i++) {
long value;
if (random.nextLong(101) <= probability) {
long increment = random.nextLong(10);
value = max + increment;
aboveCount++;
} else {
// Decrement by at least 1
long decrement = random.nextLong(10) + 1;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nit: I would call it diffToMax, because you are really just going to get a value below the max, and you are not decrementing the max. But up to you if you want to change it.

value = max - decrement;
}
longs[i] = value;
max = Math.max(max, value);
}

abovePercent = ((aboveCount + 1) * 100) / longs.length;
} while (abovePercent != probability);
}

@Test
@IR(applyIf = {"SuperWordReductions", "true"},
applyIfCPUFeatureOr = { "avx512", "true" },
counts = {IRNode.MIN_REDUCTION_V, " > 0"})
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@eme64 I've addressed all your comments except aarch64 testing. asimd is not enough, you need sve for this, but I'm yet to make it work even with sve, something's up and need to debug it further.

Hi @galderz , may I ask if these long-reduction cases can't work even with sve? It might be related with the limitation here. Some sve machines have only 128 bits.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's right. Neoverse V2 is 4 pipes of 128 bits, V1 is 2 pipes of 256 bits.
That comment is "interesting". Maybe it should be tunable by the back end. Given that Neoverse V2 can issue 4 SVE operations per clock cycle, it might still be a win.

Galder, how about you disable that line and give it another try?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

FYI: I'm working on removing the line here.

The issue is that on some platforms 2-element vectors are somehow really slower, and we need a cost-model to give us a better heuristic, rather than the hard "no". See my draft #20964.

But yes: why don't you remove the line, and see if that makes it work. If so, then don't worry about this case for now, and maybe leave a comment in the test. We can then fix that later.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, this limit limits reductions like this working on 128 bit registers:

      // Length 2 reductions of INT/LONG do not offer performance benefits
      if (((arith_type->basic_type() == T_INT) || (arith_type->basic_type() == T_LONG)) && (size == 2)) {
        retValue = false;

I've tried today to remove that but then the profitable checks fail to pass. So, I'm not going down that route now.

// Min-reduction kernel checked by the IR rule on this test method.
// Starting from the caller-supplied res, folds in 11 * a[i] for every element
// using Math.min and returns the final value; an empty array returns res as is.
// NOTE(review): the constant factor 11 appears to exist so that the loop body
// carries enough work for vectorization to kick in - confirm before changing it.
// The loop shape is what the IR rule matches against, so keep it as written.
public static long minReductionImplement(long[] a, long res) {
for (int i = 0; i < a.length; i++) {
final long v = 11 * a[i];
res = Math.min(res, v);
}
return res;
}

// Max-reduction kernel checked by the IR rule below: when SuperWordReductions
// is true and the CPU reports avx512, at least one MAX_REDUCTION_V node is expected.
// Starting from the caller-supplied res, folds in 11 * a[i] for every element
// using Math.max and returns the final value; an empty array returns res as is.
// NOTE(review): the constant factor 11 appears to exist so that the loop body
// carries enough work for vectorization to kick in - confirm before changing it.
// The loop shape is what the IR rule matches against, so keep it as written.
@Test
@IR(applyIf = {"SuperWordReductions", "true"},
applyIfCPUFeatureOr = { "avx512", "true" },
counts = {IRNode.MAX_REDUCTION_V, " > 0"})
public static long maxReductionImplement(long[] a, long res) {
for (int i = 0; i < a.length; i++) {
final long v = 11 * a[i];
res = Math.max(res, v);
}
return res;
}
}
1 change: 1 addition & 0 deletions test/micro/org/openjdk/bench/java/lang/MinMaxVector.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ public void setup() {
resultIntArray = new int[size];
resultLongArray = new long[size];
}

/**
 * Produces a sign-flipped copy of {@code nums}; the argument itself is not
 * modified.
 */
static long[] negate(long[] nums) {
    final long[] flipped = nums.clone();
    for (int idx = 0; idx < flipped.length; idx++) {
        flipped[idx] = -flipped[idx];
    }
    return flipped;
}
Expand Down