Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 11 additions & 54 deletions src/hotspot/cpu/aarch64/aarch64_vector.ad
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2020, 2024, Arm Limited. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
Expand Down Expand Up @@ -204,12 +204,6 @@ source %{
return false;
}
break;
case Op_VectorLoadShuffle:
case Op_VectorRearrange:
if (vlen < 4) {
return false;
}
break;
case Op_ExpandV:
if (UseSVE < 2 || is_subword_type(bt)) {
return false;
Expand Down Expand Up @@ -6156,61 +6150,24 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{

// ------------------------------ Vector rearrange -----------------------------

// Here is an example that rearranges a NEON vector with 4 ints:
// Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
// 1. Get the indices of V1 and store them as Vi byte[0, 1, 2, 3].
// 2. Convert Vi byte[0, 1, 2, 3] to the indices of V2 and also store them as Vi byte[2, 3, 0, 1].
// 3. Unsigned extend Long Vi from byte[2, 3, 0, 1] to int[2, 3, 0, 1].
// 4. Multiply Vi int[2, 3, 0, 1] with constant int[0x04040404, 0x04040404, 0x04040404, 0x04040404]
// and get tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
// 5. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100]
// and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
// 6. Use Vm as index register, and use V1 as table register.
// Then get V2 as the result by tbl NEON instructions.
// Notes:
// Step 1 matches VectorLoadConst.
// Step 3 matches VectorLoadShuffle.
// Step 4, 5, 6 match VectorRearrange.
// For VectorRearrange short/int, the reason why such complex calculation is
// required is because NEON tbl supports bytes table only, so for short/int, we
// need to lookup 2/4 bytes as a group. For VectorRearrange long, we use bsl
// to implement rearrange.

// Maybe move the shuffle preparation to VectorLoadShuffle
instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{
predicate(UseSVE == 0 &&
(Matcher::vector_element_basic_type(n) == T_SHORT ||
(type2aelembytes(Matcher::vector_element_basic_type(n)) == 4 &&
Matcher::vector_length_in_bytes(n) == 16)));
instruct rearrange_HSD_neon(vReg dst, vReg src, vReg shuffle, vReg tmp) %{
predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) != T_BYTE);
match(Set dst (VectorRearrange src shuffle));
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
format %{ "rearrange_HS_neon $dst, $src, $shuffle\t# vector (4S/8S/4I/4F). KILL $tmp1, $tmp2" %}
effect(TEMP_DEF dst, TEMP tmp);
format %{ "rearrange_HSD_neon $dst, $src, $shuffle\t# vector (4H/8H/2S/4S/2D). KILL $tmp" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
if (bt == T_SHORT) {
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
Assembler::SIMD_Arrangement size1 = length_in_bytes == 16 ? __ T16B : __ T8B;
Assembler::SIMD_Arrangement size2 = length_in_bytes == 16 ? __ T8H : __ T4H;
__ mov($tmp1$$FloatRegister, size1, 0x02);
__ mov($tmp2$$FloatRegister, size2, 0x0100);
__ mulv($dst$$FloatRegister, size2, $shuffle$$FloatRegister, $tmp1$$FloatRegister);
__ addv($dst$$FloatRegister, size1, $dst$$FloatRegister, $tmp2$$FloatRegister);
__ tbl($dst$$FloatRegister, size1, $src$$FloatRegister, 1, $dst$$FloatRegister);
} else {
assert(bt == T_INT || bt == T_FLOAT, "unsupported type");
__ mov($tmp1$$FloatRegister, __ T16B, 0x04);
__ mov($tmp2$$FloatRegister, __ T4S, 0x03020100);
__ mulv($dst$$FloatRegister, __ T4S, $shuffle$$FloatRegister, $tmp1$$FloatRegister);
__ addv($dst$$FloatRegister, __ T16B, $dst$$FloatRegister, $tmp2$$FloatRegister);
__ tbl($dst$$FloatRegister, __ T16B, $src$$FloatRegister, 1, $dst$$FloatRegister);
}
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
__ neon_rearrange_hsd($dst$$FloatRegister, $src$$FloatRegister,
$shuffle$$FloatRegister, $tmp$$FloatRegister,
bt, length_in_bytes == 16);
%}
ins_pipe(pipe_slow);
%}

instruct rearrange(vReg dst, vReg src, vReg shuffle) %{
predicate(Matcher::vector_element_basic_type(n) == T_BYTE || UseSVE > 0);
predicate(UseSVE > 0 || Matcher::vector_element_basic_type(n) == T_BYTE);
match(Set dst (VectorRearrange src shuffle));
format %{ "rearrange $dst, $src, $shuffle" %}
ins_encode %{
Expand Down
65 changes: 11 additions & 54 deletions src/hotspot/cpu/aarch64/aarch64_vector_ad.m4
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
//
// Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2020, 2024, Arm Limited. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
Expand Down Expand Up @@ -194,12 +194,6 @@ source %{
return false;
}
break;
case Op_VectorLoadShuffle:
case Op_VectorRearrange:
if (vlen < 4) {
return false;
}
break;
case Op_ExpandV:
if (UseSVE < 2 || is_subword_type(bt)) {
return false;
Expand Down Expand Up @@ -4403,61 +4397,24 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{

// ------------------------------ Vector rearrange -----------------------------

// Here is an example that rearranges a NEON vector with 4 ints:
// Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
// 1. Get the indices of V1 and store them as Vi byte[0, 1, 2, 3].
// 2. Convert Vi byte[0, 1, 2, 3] to the indices of V2 and also store them as Vi byte[2, 3, 0, 1].
// 3. Unsigned extend Long Vi from byte[2, 3, 0, 1] to int[2, 3, 0, 1].
// 4. Multiply Vi int[2, 3, 0, 1] with constant int[0x04040404, 0x04040404, 0x04040404, 0x04040404]
// and get tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
// 5. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100]
// and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
// 6. Use Vm as index register, and use V1 as table register.
// Then get V2 as the result by tbl NEON instructions.
// Notes:
// Step 1 matches VectorLoadConst.
// Step 3 matches VectorLoadShuffle.
// Step 4, 5, 6 match VectorRearrange.
// For VectorRearrange short/int, the reason why such complex calculation is
// required is because NEON tbl supports bytes table only, so for short/int, we
// need to lookup 2/4 bytes as a group. For VectorRearrange long, we use bsl
// to implement rearrange.

// Maybe move the shuffle preparation to VectorLoadShuffle
instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{
predicate(UseSVE == 0 &&
(Matcher::vector_element_basic_type(n) == T_SHORT ||
(type2aelembytes(Matcher::vector_element_basic_type(n)) == 4 &&
Matcher::vector_length_in_bytes(n) == 16)));
instruct rearrange_HSD_neon(vReg dst, vReg src, vReg shuffle, vReg tmp) %{
predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) != T_BYTE);
match(Set dst (VectorRearrange src shuffle));
effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2);
format %{ "rearrange_HS_neon $dst, $src, $shuffle\t# vector (4S/8S/4I/4F). KILL $tmp1, $tmp2" %}
effect(TEMP_DEF dst, TEMP tmp);
format %{ "rearrange_HSD_neon $dst, $src, $shuffle\t# vector (4H/8H/2S/4S/2D). KILL $tmp" %}
ins_encode %{
BasicType bt = Matcher::vector_element_basic_type(this);
if (bt == T_SHORT) {
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
Assembler::SIMD_Arrangement size1 = length_in_bytes == 16 ? __ T16B : __ T8B;
Assembler::SIMD_Arrangement size2 = length_in_bytes == 16 ? __ T8H : __ T4H;
__ mov($tmp1$$FloatRegister, size1, 0x02);
__ mov($tmp2$$FloatRegister, size2, 0x0100);
__ mulv($dst$$FloatRegister, size2, $shuffle$$FloatRegister, $tmp1$$FloatRegister);
__ addv($dst$$FloatRegister, size1, $dst$$FloatRegister, $tmp2$$FloatRegister);
__ tbl($dst$$FloatRegister, size1, $src$$FloatRegister, 1, $dst$$FloatRegister);
} else {
assert(bt == T_INT || bt == T_FLOAT, "unsupported type");
__ mov($tmp1$$FloatRegister, __ T16B, 0x04);
__ mov($tmp2$$FloatRegister, __ T4S, 0x03020100);
__ mulv($dst$$FloatRegister, __ T4S, $shuffle$$FloatRegister, $tmp1$$FloatRegister);
__ addv($dst$$FloatRegister, __ T16B, $dst$$FloatRegister, $tmp2$$FloatRegister);
__ tbl($dst$$FloatRegister, __ T16B, $src$$FloatRegister, 1, $dst$$FloatRegister);
}
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
assert(length_in_bytes == 8 || length_in_bytes == 16, "must be");
__ neon_rearrange_hsd($dst$$FloatRegister, $src$$FloatRegister,
$shuffle$$FloatRegister, $tmp$$FloatRegister,
bt, length_in_bytes == 16);
%}
ins_pipe(pipe_slow);
%}

instruct rearrange(vReg dst, vReg src, vReg shuffle) %{
predicate(Matcher::vector_element_basic_type(n) == T_BYTE || UseSVE > 0);
predicate(UseSVE > 0 || Matcher::vector_element_basic_type(n) == T_BYTE);
match(Set dst (VectorRearrange src shuffle));
format %{ "rearrange $dst, $src, $shuffle" %}
ins_encode %{
Expand Down
58 changes: 58 additions & 0 deletions src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2545,6 +2545,64 @@ void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src,
}
}

// VectorRearrange implementation for short/int/float/long/double types with NEON
// instructions. For VectorRearrange short/int/float, we use NEON tbl instruction.
// But since it supports bytes table only, we need to lookup 2/4 bytes as a group.
// For VectorRearrange long/double, we compare the shuffle input with iota indices,
// and use bsl to implement the operation.
void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
FloatRegister shuffle, FloatRegister tmp,
BasicType bt, bool isQ) {
assert_different_registers(dst, src, shuffle, tmp);
SIMD_Arrangement size1 = isQ ? T16B : T8B;
SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);

// Here is an example that rearranges a NEON vector with 4 ints:
// Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
// 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
// 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
// [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
// tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
// 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
// and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504]
// 4. Use Vm as index register, and use V1 as table register.
// Then get V2 as the result by tbl NEON instructions.
switch (bt) {
case T_SHORT:
mov(tmp, size1, 0x02);
mulv(dst, size2, shuffle, tmp);
mov(tmp, size2, 0x0100);
addv(dst, size1, dst, tmp);
tbl(dst, size1, src, 1, dst);
break;
case T_INT:
case T_FLOAT:
mov(tmp, size1, 0x04);
mulv(dst, size2, shuffle, tmp);
mov(tmp, size2, 0x03020100);
addv(dst, size1, dst, tmp);
tbl(dst, size1, src, 1, dst);
break;
case T_LONG:
case T_DOUBLE:
// Load the iota indices for Long type. The indices are ordered by
// type B/S/I/L/F/D, and the offset between two types is 16; Hence
// the offset for L is 48.
lea(rscratch1,
Copy link
Contributor

@Bhavana-Kilambi Bhavana-Kilambi Mar 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @XiaohongGong , thanks for adding support for 2D/2L as well. I was trying to implement the same for the two vector table and I am wondering what you think of this implementation -

negr(dst, shuffle); // this would help create a mask. If input is 1, it would be all 1s and all 0s if it's 0
dup(tmp1, src1, 0); // duplicate first element of src1
dup(tmp2, src1, 1); // duplicate second element of src1
bsl(dst, T16B, tmp2, tmp1); // Select from tmp2 if dst is 1 and from tmp1 if dst is 0 

I am really not sure which implementation would be faster though. This implementation might take around 8 cycles.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sounds good to me. I will try with this solution and compare the performance on my Grace CPU. Thanks for this advice!

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @Bhavana-Kilambi , I've finished the test with what you suggested on my Grace CPU. The vectorapi jtreg tests all pass. So this solution works well. But the performance shows no obvious change compared with the current PR's codegen, contrary to what I expected.

Here is the performance data:

Benchmark                                     (size)  Mode  Cnt   Current   Bhavana's  Units   Gain
Double128Vector.rearrange                      1024  thrpt   30    591.504    588.616   ops/ms  0.995
Long128Vector.rearrange                        1024  thrpt   30    593.348    590.802   ops/ms  0.995
SelectFromBenchmark.rearrangeFromByteVector    1024  thrpt   30  16576.713  16664.580   ops/ms  1.005
SelectFromBenchmark.rearrangeFromByteVector    2048  thrpt   30   8358.694   8392.733   ops/ms  1.004
SelectFromBenchmark.rearrangeFromDoubleVector  1024  thrpt   30   1312.752   1213.538   ops/ms  0.924
SelectFromBenchmark.rearrangeFromDoubleVector  2048  thrpt   30    657.365    607.060   ops/ms  0.923
SelectFromBenchmark.rearrangeFromFloatVector   1024  thrpt   30   1905.595   1911.831   ops/ms  1.003
SelectFromBenchmark.rearrangeFromFloatVector   2048  thrpt   30    952.205    957.160   ops/ms  1.005
SelectFromBenchmark.rearrangeFromIntVector     1024  thrpt   30   2106.763   2107.238   ops/ms  1.000
SelectFromBenchmark.rearrangeFromIntVector     2048  thrpt   30   1056.299   1056.769   ops/ms  1.000
SelectFromBenchmark.rearrangeFromLongVector    1024  thrpt   30   1462.355   1247.853   ops/ms  0.853
SelectFromBenchmark.rearrangeFromLongVector    2048  thrpt   30    732.559    616.753   ops/ms  0.841
SelectFromBenchmark.rearrangeFromShortVector   1024  thrpt   30   4560.253   4559.861   ops/ms  0.999
SelectFromBenchmark.rearrangeFromShortVector   2048  thrpt   30   2279.058   2279.693   ops/ms  1.000
VectorXXH3HashingBenchmark.hashingKernel       1024  thrpt   30   1080.589   1073.883   ops/ms  0.993
VectorXXH3HashingBenchmark.hashingKernel       2048  thrpt   30    541.629    537.288   ops/ms  0.991
VectorXXH3HashingBenchmark.hashingKernel       4096  thrpt   30    269.886    268.460   ops/ms  0.994
VectorXXH3HashingBenchmark.hashingKernel       8192  thrpt   30    135.193    134.175   ops/ms  0.992

I expected it to have an obvious improvement since we do not need the heavy ldr instruction. But I also got similar performance data on an AArch64 N1 machine. One shortcoming of your suggestion I can see is that it needs one more temp vector register. To be honest, I'm not sure which one is better. Maybe we need more performance data on different kinds of AArch64 machines. So, would you mind testing the performance on other AArch64 machines with NEON? Thanks a lot!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @XiaohongGong , thanks for testing this variation. I also expected it to have relatively better performance due to the absence of the load instruction. Maybe it might help in larger real-world workload where reducing some load instructions or having fewer instructions can help performance (by reducing pressure on icache/iTLB).
Thinking of aarch64 Neon machines that we can test this on - we have only N1, V2 (Grace) machines which have support for 128-bit Neon. V1 is 256 bit Neon/SVE which will execute the sve tbl instruction instead. I can of course disable SVE and run the Neon instructions on V1 but I don't think that would really make any difference. So for 128-bit Neon machines, I can also test only on N1 and V2 which you've already done. Do you have a specific machine in mind that you'd like this to be tested on?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for your clarification, @Bhavana-Kilambi . I agree with you that it may not make any difference on other machines. So do you suggest that I change the pattern right now, or revisit this part once we meet a performance issue in other real-world workloads?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, I am fine with going ahead with the current implementation and revisit if we encounter any performance issues. Thanks for testing.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have all the vectorAPI JTREG tests been tested on N1 and Grace?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Have all the vectorAPI JTREG tests been tested on N1 and Grace?

Yes, of course. I tested all vector API related jtreg tests both with NEON and SVE.

ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
ldrq(tmp, rscratch1);
// Check whether the input "shuffle" is the same with iota indices.
// Return "src" if true, otherwise swap the two elements of "src".
cm(EQ, dst, size2, shuffle, tmp);
ext(tmp, size1, src, src, 8);
bsl(dst, size1, src, tmp);
break;
default:
assert(false, "unsupported element type");
ShouldNotReachHere();
}
}

// Extract a scalar element from an sve vector at position 'idx'.
// The input elements in src are expected to be of integral type.
void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
Expand Down
4 changes: 3 additions & 1 deletion src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -179,6 +179,8 @@

void neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ);

void neon_rearrange_hsd(FloatRegister dst, FloatRegister src, FloatRegister shuffle,
FloatRegister tmp, BasicType bt, bool isQ);
// java.lang.Math::signum intrinsics
void vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero,
FloatRegister one, SIMD_Arrangement T);
Expand Down
30 changes: 30 additions & 0 deletions test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,36 @@ public class IRNode {
beforeMatchingNameRegex(OPAQUE_MULTIVERSIONING, "OpaqueMultiversioning");
}

public static final String REARRANGE_VB = VECTOR_PREFIX + "REARRANGE_VB" + POSTFIX;
static {
vectorNode(REARRANGE_VB, "VectorRearrange", TYPE_BYTE);
}

public static final String REARRANGE_VS = VECTOR_PREFIX + "REARRANGE_VS" + POSTFIX;
static {
vectorNode(REARRANGE_VS, "VectorRearrange", TYPE_SHORT);
}

public static final String REARRANGE_VI = VECTOR_PREFIX + "REARRANGE_VI" + POSTFIX;
static {
vectorNode(REARRANGE_VI, "VectorRearrange", TYPE_INT);
}

public static final String REARRANGE_VL = VECTOR_PREFIX + "REARRANGE_VL" + POSTFIX;
static {
vectorNode(REARRANGE_VL, "VectorRearrange", TYPE_LONG);
}

public static final String REARRANGE_VF = VECTOR_PREFIX + "REARRANGE_VF" + POSTFIX;
static {
vectorNode(REARRANGE_VF, "VectorRearrange", TYPE_FLOAT);
}

public static final String REARRANGE_VD = VECTOR_PREFIX + "REARRANGE_VD" + POSTFIX;
static {
vectorNode(REARRANGE_VD, "VectorRearrange", TYPE_DOUBLE);
}

public static final String ADD_P_OF = COMPOSITE_PREFIX + "ADD_P_OF" + POSTFIX;
static {
String regex = START + "addP_" + IS_REPLACED + MID + ".*" + END;
Expand Down
Loading