From 8934fae68ed58e0af126a53ec35c4b1a810ceec7 Mon Sep 17 00:00:00 2001
From: Xiaohong Gong
Date: Thu, 20 Feb 2025 02:28:52 +0000
Subject: [PATCH 1/4] 8350463: AArch64: Add vector rearrange support for small
 lane count vectors

The AArch64 vector rearrange implementation currently lacks support for
vector types with lane counts < 4 (see [1]). This limitation results in
significant performance gaps when running Long/Double vector benchmarks
on NVIDIA Grace (an SVE2 architecture with 128-bit vectors) compared
with other SVE and x86 platforms.

Vector rearrange operations depend on vector shuffle inputs, which
previously used a byte array as their payload. Since the minimum vector
lane count for the byte type on AArch64 is 4, this imposed the same
limitation on rearrange operations. However, the vector shuffle payload
has since been updated to use vector-specific data types (e.g., `int`
for `IntVector`; see [2]), which enables us to remove the lane count
restriction on vector rearrange operations.

This patch adds rearrange support for vector types with small lane
counts. The main changes are:
- Added AArch64 match rule support for `VectorRearrange` with smaller
  lane counts (e.g., `2D/2S`)
- Relocated the NEON implementation from the ad file to the C2 macro
  assembler file, which is better suited to the more complex code
- Reduced temporary register usage in the NEON implementation for
  short/int/float types from two registers to one

Below is the performance improvement of several Vector API JMH
benchmarks on an NVIDIA Grace CPU with NEON and SVE. Performance of the
same benchmarks with other vector types remains unchanged.

1) NEON

JMH on panama-vector:vectorIntrinsics:
```
Benchmark                     (size)  Mode   Cnt  Units   Before  After     Gain
Double128Vector.rearrange       1024  thrpt   30  ops/ms  78.060   578.859   7.42x
Double128Vector.sliceUnary      1024  thrpt   30  ops/ms  72.332  1811.664  25.05x
Double128Vector.unsliceUnary    1024  thrpt   30  ops/ms  72.256  1812.344  25.08x
Float64Vector.rearrange         1024  thrpt   30  ops/ms  77.879   558.797   7.18x
Float64Vector.sliceUnary        1024  thrpt   30  ops/ms  70.528  1981.304  28.09x
Float64Vector.unsliceUnary      1024  thrpt   30  ops/ms  71.735  1994.168  27.79x
Int64Vector.rearrange           1024  thrpt   30  ops/ms  76.374   562.106   7.36x
Int64Vector.sliceUnary          1024  thrpt   30  ops/ms  71.680  1190.127  16.60x
Int64Vector.unsliceUnary        1024  thrpt   30  ops/ms  71.895  1185.094  16.48x
Long128Vector.rearrange         1024  thrpt   30  ops/ms  78.902   579.250   7.34x
Long128Vector.sliceUnary        1024  thrpt   30  ops/ms  72.389   747.794  10.33x
Long128Vector.unsliceUnary      1024  thrpt   30  ops/ms  71.999   747.848  10.38x
```

JMH on jdk mainline:
```
Benchmark                                      (SIZE)  Mode   Cnt  Units   Before  After     Gain
SelectFromBenchmark.rearrangeFromDoubleVector    1024  thrpt   30  ops/ms  44.593  1319.977  29.63x
SelectFromBenchmark.rearrangeFromDoubleVector    2048  thrpt   30  ops/ms  22.318   660.061  29.58x
SelectFromBenchmark.rearrangeFromLongVector      1024  thrpt   30  ops/ms  45.823  1458.144  31.82x
SelectFromBenchmark.rearrangeFromLongVector      2048  thrpt   30  ops/ms  23.050   729.881  31.67x
VectorXXH3HashingBenchmark.hashingKernel         1024  thrpt   30  ops/ms  97.210  1082.884  11.14x
VectorXXH3HashingBenchmark.hashingKernel         2048  thrpt   30  ops/ms  48.642   541.341  11.13x
VectorXXH3HashingBenchmark.hashingKernel         4096  thrpt   30  ops/ms  24.285   270.419  11.14x
VectorXXH3HashingBenchmark.hashingKernel         8192  thrpt   30  ops/ms  12.421   135.115  10.88x
```

2) SVE

JMH on panama-vector:vectorIntrinsics:
```
Benchmark                     (size)  Mode   Cnt  Units   Before  After     Gain
Double128Vector.rearrange       1024  thrpt   30  ops/ms  78.396   577.744   7.37x
Double128Vector.sliceUnary      1024  thrpt   30  ops/ms  72.119  2538.261  35.19x
Double128Vector.unsliceUnary 1024 thrpt 30 ops/ms 72.992 2536.972 34.75x Float64Vector.rearrange 1024 thrpt 30 ops/ms 77.400 561.934 7.26x Float64Vector.sliceUnary 1024 thrpt 30 ops/ms 70.858 2949.076 41.61x Float64Vector.unsliceUnary 1024 thrpt 30 ops/ms 70.654 2954.273 41.81x Int64Vector.rearrange 1024 thrpt 30 ops/ms 77.851 563.969 7.24x Int64Vector.sliceUnary 1024 thrpt 30 ops/ms 67.433 1510.484 22.39x Int64Vector.unsliceUnary 1024 thrpt 30 ops/ms 66.614 1511.617 22.69x Long128Vector.rearrange 1024 thrpt 30 ops/ms 77.637 579.021 7.46x Long128Vector.sliceUnary 1024 thrpt 30 ops/ms 69.886 1274.331 18.23x Long128Vector.unsliceUnary 1024 thrpt 30 ops/ms 70.069 1273.787 18.17x ``` JMH on jdk mainline: ``` Benchmark (SIZE) Mode Cnt Units Before After Gain SelectFromBenchmark.rearrangeFromDoubleVector 1024 thrpt 30 ops/ms 44.612 1351.850 30.30x SelectFromBenchmark.rearrangeFromDoubleVector 2048 thrpt 30 ops/ms 22.315 676.314 30.31x SelectFromBenchmark.rearrangeFromLongVector 1024 thrpt 30 ops/ms 46.372 1502.036 32.39x SelectFromBenchmark.rearrangeFromLongVector 2048 thrpt 30 ops/ms 23.361 749.133 32.07x VectorXXH3HashingBenchmark.hashingKernel 1024 thrpt 30 ops/ms 97.780 1759.061 17.99x VectorXXH3HashingBenchmark.hashingKernel 2048 thrpt 30 ops/ms 48.923 879.584 17.98x VectorXXH3HashingBenchmark.hashingKernel 4096 thrpt 30 ops/ms 24.219 439.588 18.15x VectorXXH3HashingBenchmark.hashingKernel 8192 thrpt 30 ops/ms 12.416 219.603 17.69x ``` [1] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64_vector.ad#L209 [2] https://bugs.openjdk.org/browse/JDK-8310691 --- src/hotspot/cpu/aarch64/aarch64_vector.ad | 65 ++++--------------- src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 | 65 ++++--------------- .../cpu/aarch64/c2_MacroAssembler_aarch64.cpp | 58 +++++++++++++++++ .../cpu/aarch64/c2_MacroAssembler_aarch64.hpp | 4 +- 4 files changed, 83 insertions(+), 109 deletions(-) diff --git a/src/hotspot/cpu/aarch64/aarch64_vector.ad b/src/hotspot/cpu/aarch64/aarch64_vector.ad index d217fb7e94987..71701196af538 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector.ad +++ b/src/hotspot/cpu/aarch64/aarch64_vector.ad @@ -1,5 +1,5 @@ // -// Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2020, 2024, Arm Limited. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -204,12 +204,6 @@ source %{ return false; } break; - case Op_VectorLoadShuffle: - case Op_VectorRearrange: - if (vlen < 4) { - return false; - } - break; case Op_ExpandV: if (UseSVE < 2 || is_subword_type(bt)) { return false; @@ -6156,61 +6150,24 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{ // ------------------------------ Vector rearrange ----------------------------- -// Here is an example that rearranges a NEON vector with 4 ints: -// Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] -// 1. Get the indices of V1 and store them as Vi byte[0, 1, 2, 3]. -// 2. Convert Vi byte[0, 1, 2, 3] to the indices of V2 and also store them as Vi byte[2, 3, 0, 1]. -// 3. Unsigned extend Long Vi from byte[2, 3, 0, 1] to int[2, 3, 0, 1]. -// 4. Multiply Vi int[2, 3, 0, 1] with constant int[0x04040404, 0x04040404, 0x04040404, 0x04040404] -// and get tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. -// 5. 
Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100] -// and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] -// 6. Use Vm as index register, and use V1 as table register. -// Then get V2 as the result by tbl NEON instructions. -// Notes: -// Step 1 matches VectorLoadConst. -// Step 3 matches VectorLoadShuffle. -// Step 4, 5, 6 match VectorRearrange. -// For VectorRearrange short/int, the reason why such complex calculation is -// required is because NEON tbl supports bytes table only, so for short/int, we -// need to lookup 2/4 bytes as a group. For VectorRearrange long, we use bsl -// to implement rearrange. - -// Maybe move the shuffle preparation to VectorLoadShuffle -instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{ - predicate(UseSVE == 0 && - (Matcher::vector_element_basic_type(n) == T_SHORT || - (type2aelembytes(Matcher::vector_element_basic_type(n)) == 4 && - Matcher::vector_length_in_bytes(n) == 16))); +instruct rearrange_HSD_neon(vReg dst, vReg src, vReg shuffle, vReg tmp) %{ + predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) != T_BYTE); match(Set dst (VectorRearrange src shuffle)); - effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); - format %{ "rearrange_HS_neon $dst, $src, $shuffle\t# vector (4S/8S/4I/4F). KILL $tmp1, $tmp2" %} + effect(TEMP_DEF dst, TEMP tmp); + format %{ "rearrange_HSD_neon $dst, $src, $shuffle\t# vector (4H/8H/2S/4S/2D). KILL $tmp" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); - if (bt == T_SHORT) { - uint length_in_bytes = Matcher::vector_length_in_bytes(this); - assert(length_in_bytes == 8 || length_in_bytes == 16, "must be"); - Assembler::SIMD_Arrangement size1 = length_in_bytes == 16 ? __ T16B : __ T8B; - Assembler::SIMD_Arrangement size2 = length_in_bytes == 16 ? 
__ T8H : __ T4H; - __ mov($tmp1$$FloatRegister, size1, 0x02); - __ mov($tmp2$$FloatRegister, size2, 0x0100); - __ mulv($dst$$FloatRegister, size2, $shuffle$$FloatRegister, $tmp1$$FloatRegister); - __ addv($dst$$FloatRegister, size1, $dst$$FloatRegister, $tmp2$$FloatRegister); - __ tbl($dst$$FloatRegister, size1, $src$$FloatRegister, 1, $dst$$FloatRegister); - } else { - assert(bt == T_INT || bt == T_FLOAT, "unsupported type"); - __ mov($tmp1$$FloatRegister, __ T16B, 0x04); - __ mov($tmp2$$FloatRegister, __ T4S, 0x03020100); - __ mulv($dst$$FloatRegister, __ T4S, $shuffle$$FloatRegister, $tmp1$$FloatRegister); - __ addv($dst$$FloatRegister, __ T16B, $dst$$FloatRegister, $tmp2$$FloatRegister); - __ tbl($dst$$FloatRegister, __ T16B, $src$$FloatRegister, 1, $dst$$FloatRegister); - } + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + assert(length_in_bytes == 8 || length_in_bytes == 16, "must be"); + __ neon_rearrange_hsd($dst$$FloatRegister, $src$$FloatRegister, + $shuffle$$FloatRegister, $tmp$$FloatRegister, + bt, length_in_bytes == 16); %} ins_pipe(pipe_slow); %} instruct rearrange(vReg dst, vReg src, vReg shuffle) %{ - predicate(Matcher::vector_element_basic_type(n) == T_BYTE || UseSVE > 0); + predicate(UseSVE > 0 || Matcher::vector_element_basic_type(n) == T_BYTE); match(Set dst (VectorRearrange src shuffle)); format %{ "rearrange $dst, $src, $shuffle" %} ins_encode %{ diff --git a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 index 422e98d9b681a..575a37608fdc1 100644 --- a/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 +++ b/src/hotspot/cpu/aarch64/aarch64_vector_ad.m4 @@ -1,5 +1,5 @@ // -// Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved. // Copyright (c) 2020, 2024, Arm Limited. All rights reserved. // DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. // @@ -194,12 +194,6 @@ source %{ return false; } break; - case Op_VectorLoadShuffle: - case Op_VectorRearrange: - if (vlen < 4) { - return false; - } - break; case Op_ExpandV: if (UseSVE < 2 || is_subword_type(bt)) { return false; @@ -4403,61 +4397,24 @@ instruct vtest_alltrue_sve(rFlagsReg cr, pReg src1, pReg src2, pReg ptmp) %{ // ------------------------------ Vector rearrange ----------------------------- -// Here is an example that rearranges a NEON vector with 4 ints: -// Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1] -// 1. Get the indices of V1 and store them as Vi byte[0, 1, 2, 3]. -// 2. Convert Vi byte[0, 1, 2, 3] to the indices of V2 and also store them as Vi byte[2, 3, 0, 1]. -// 3. Unsigned extend Long Vi from byte[2, 3, 0, 1] to int[2, 3, 0, 1]. -// 4. Multiply Vi int[2, 3, 0, 1] with constant int[0x04040404, 0x04040404, 0x04040404, 0x04040404] -// and get tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404]. -// 5. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100] -// and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504] -// 6. Use Vm as index register, and use V1 as table register. -// Then get V2 as the result by tbl NEON instructions. -// Notes: -// Step 1 matches VectorLoadConst. -// Step 3 matches VectorLoadShuffle. -// Step 4, 5, 6 match VectorRearrange. -// For VectorRearrange short/int, the reason why such complex calculation is -// required is because NEON tbl supports bytes table only, so for short/int, we -// need to lookup 2/4 bytes as a group. 
For VectorRearrange long, we use bsl -// to implement rearrange. - -// Maybe move the shuffle preparation to VectorLoadShuffle -instruct rearrange_HS_neon(vReg dst, vReg src, vReg shuffle, vReg tmp1, vReg tmp2) %{ - predicate(UseSVE == 0 && - (Matcher::vector_element_basic_type(n) == T_SHORT || - (type2aelembytes(Matcher::vector_element_basic_type(n)) == 4 && - Matcher::vector_length_in_bytes(n) == 16))); +instruct rearrange_HSD_neon(vReg dst, vReg src, vReg shuffle, vReg tmp) %{ + predicate(UseSVE == 0 && Matcher::vector_element_basic_type(n) != T_BYTE); match(Set dst (VectorRearrange src shuffle)); - effect(TEMP_DEF dst, TEMP tmp1, TEMP tmp2); - format %{ "rearrange_HS_neon $dst, $src, $shuffle\t# vector (4S/8S/4I/4F). KILL $tmp1, $tmp2" %} + effect(TEMP_DEF dst, TEMP tmp); + format %{ "rearrange_HSD_neon $dst, $src, $shuffle\t# vector (4H/8H/2S/4S/2D). KILL $tmp" %} ins_encode %{ BasicType bt = Matcher::vector_element_basic_type(this); - if (bt == T_SHORT) { - uint length_in_bytes = Matcher::vector_length_in_bytes(this); - assert(length_in_bytes == 8 || length_in_bytes == 16, "must be"); - Assembler::SIMD_Arrangement size1 = length_in_bytes == 16 ? __ T16B : __ T8B; - Assembler::SIMD_Arrangement size2 = length_in_bytes == 16 ? __ T8H : __ T4H; - __ mov($tmp1$$FloatRegister, size1, 0x02); - __ mov($tmp2$$FloatRegister, size2, 0x0100); - __ mulv($dst$$FloatRegister, size2, $shuffle$$FloatRegister, $tmp1$$FloatRegister); - __ addv($dst$$FloatRegister, size1, $dst$$FloatRegister, $tmp2$$FloatRegister); - __ tbl($dst$$FloatRegister, size1, $src$$FloatRegister, 1, $dst$$FloatRegister); - } else { - assert(bt == T_INT || bt == T_FLOAT, "unsupported type"); - __ mov($tmp1$$FloatRegister, __ T16B, 0x04); - __ mov($tmp2$$FloatRegister, __ T4S, 0x03020100); - __ mulv($dst$$FloatRegister, __ T4S, $shuffle$$FloatRegister, $tmp1$$FloatRegister); - __ addv($dst$$FloatRegister, __ T16B, $dst$$FloatRegister, $tmp2$$FloatRegister); - __ tbl($dst$$FloatRegister, __ T16B, $src$$FloatRegister, 1, $dst$$FloatRegister); - } + uint length_in_bytes = Matcher::vector_length_in_bytes(this); + assert(length_in_bytes == 8 || length_in_bytes == 16, "must be"); + __ neon_rearrange_hsd($dst$$FloatRegister, $src$$FloatRegister, + $shuffle$$FloatRegister, $tmp$$FloatRegister, + bt, length_in_bytes == 16); %} ins_pipe(pipe_slow); %} instruct rearrange(vReg dst, vReg src, vReg shuffle) %{ - predicate(Matcher::vector_element_basic_type(n) == T_BYTE || UseSVE > 0); + predicate(UseSVE > 0 || Matcher::vector_element_basic_type(n) == T_BYTE); match(Set dst (VectorRearrange src shuffle)); format %{ "rearrange $dst, $src, $shuffle" %} ins_encode %{ diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp index e3d197a457215..2076e28f61ac6 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp @@ -2549,6 +2549,64 @@ void C2_MacroAssembler::neon_reverse_bytes(FloatRegister dst, FloatRegister src, } } +// VectorRearrange implementation for short/int/float/long/double types with NEON +// instructions. For VectorRearrange short/int/float, we use NEON tbl instruction. +// But since it supports bytes table only, we need to lookup 2/4 bytes as a group. +// For VectorRearrange long/double, we compare the shuffle input with iota indices, +// and use bsl to implement the operation. 
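+//
+// For example, for a 2D vector: to rearrange src [a, b] with shuffle
+// [1, 0], we compare the shuffle with the iota indices [0, 1], which
+// yields a per-lane mask that is set only where shuffle[i] == i. An
+// "ext" then rotates src by 8 bytes into [b, a], and "bsl" keeps the
+// src lane wherever the mask is set, taking the rotated lane elsewhere.
+// Here no lane matches, so the result is [b, a]; the identity shuffle
+// [0, 1] would return src unchanged.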
+void C2_MacroAssembler::neon_rearrange_hsd(FloatRegister dst, FloatRegister src,
+                                           FloatRegister shuffle, FloatRegister tmp,
+                                           BasicType bt, bool isQ) {
+  assert_different_registers(dst, src, shuffle, tmp);
+  SIMD_Arrangement size1 = isQ ? T16B : T8B;
+  SIMD_Arrangement size2 = esize2arrangement((uint)type2aelembytes(bt), isQ);
+
+  // Here is an example that rearranges a NEON vector with 4 ints:
+  // Rearrange V1 int[a0, a1, a2, a3] to V2 int[a2, a3, a0, a1]
+  // 1. We assume the shuffle input is Vi int[2, 3, 0, 1].
+  // 2. Multiply Vi int[2, 3, 0, 1] with constant int vector
+  //    [0x04040404, 0x04040404, 0x04040404, 0x04040404], and get
+  //    tbl base Vm int[0x08080808, 0x0c0c0c0c, 0x00000000, 0x04040404].
+  // 3. Add Vm with constant int[0x03020100, 0x03020100, 0x03020100, 0x03020100],
+  //    and get tbl index Vm int[0x0b0a0908, 0x0f0e0d0c, 0x03020100, 0x07060504].
+  // 4. Use Vm as the index register, and use V1 as the table register.
+  //    Then get V2 as the result of the NEON tbl instruction.
+  switch (bt) {
+    case T_SHORT:
+      mov(tmp, size1, 0x02);
+      mulv(dst, size2, shuffle, tmp);
+      mov(tmp, size2, 0x0100);
+      addv(dst, size1, dst, tmp);
+      tbl(dst, size1, src, 1, dst);
+      break;
+    case T_INT:
+    case T_FLOAT:
+      mov(tmp, size1, 0x04);
+      mulv(dst, size2, shuffle, tmp);
+      mov(tmp, size2, 0x03020100);
+      addv(dst, size1, dst, tmp);
+      tbl(dst, size1, src, 1, dst);
+      break;
+    case T_LONG:
+    case T_DOUBLE:
+      // Load the iota indices for the long type. The indices are ordered
+      // by type B/S/I/L/F/D, and the offset between two types is 16;
+      // hence the offset for L is 48.
+      lea(rscratch1,
+          ExternalAddress(StubRoutines::aarch64::vector_iota_indices() + 48));
+      ldrq(tmp, rscratch1);
+      // Check whether the input "shuffle" is the same as the iota
+      // indices. Return "src" if true, otherwise swap the two elements
+      // of "src".
+      cm(EQ, dst, size2, shuffle, tmp);
+      ext(tmp, size1, src, src, 8);
+      bsl(dst, size1, src, tmp);
+      break;
+    default:
+      assert(false, "unsupported element type");
+      ShouldNotReachHere();
+  }
+}
+
 // Extract a scalar element from an sve vector at position 'idx'.
 // The input elements in src are expected to be of integral type.
 void C2_MacroAssembler::sve_extract_integral(Register dst, BasicType bt, FloatRegister src,
diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
index d61b050407d21..e0eaa0b76e6e9 100644
--- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
+++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2020, 2025, Oracle and/or its affiliates. All rights reserved.
  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
* * This code is free software; you can redistribute it and/or modify it @@ -179,6 +179,8 @@ void neon_reverse_bytes(FloatRegister dst, FloatRegister src, BasicType bt, bool isQ); + void neon_rearrange_hsd(FloatRegister dst, FloatRegister src, FloatRegister shuffle, + FloatRegister tmp, BasicType bt, bool isQ); // java.lang.Math::signum intrinsics void vector_signum_neon(FloatRegister dst, FloatRegister src, FloatRegister zero, FloatRegister one, SIMD_Arrangement T); From c0ebfa435ca1a9fbf30f8e3b8566ebdf4de914fc Mon Sep 17 00:00:00 2001 From: Xiaohong Gong Date: Fri, 14 Mar 2025 09:33:57 +0000 Subject: [PATCH 2/4] Add the IR test --- .../compiler/lib/ir_framework/IRNode.java | 30 +++ .../vectorapi/VectorRearrangeTest.java | 222 ++++++++++++++++++ 2 files changed, 252 insertions(+) create mode 100644 test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java diff --git a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java index 8f28294a98685..dcc9186660dea 100644 --- a/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java +++ b/test/hotspot/jtreg/compiler/lib/ir_framework/IRNode.java @@ -329,6 +329,36 @@ public class IRNode { superWordNodes(ADD_REDUCTION_VL, "AddReductionVL"); } + public static final String REARRANGE_VB = VECTOR_PREFIX + "REARRANGE_VB" + POSTFIX; + static { + vectorNode(REARRANGE_VB, "VectorRearrange", TYPE_BYTE); + } + + public static final String REARRANGE_VS = VECTOR_PREFIX + "REARRANGE_VS" + POSTFIX; + static { + vectorNode(REARRANGE_VS, "VectorRearrange", TYPE_SHORT); + } + + public static final String REARRANGE_VI = VECTOR_PREFIX + "REARRANGE_VI" + POSTFIX; + static { + vectorNode(REARRANGE_VI, "VectorRearrange", TYPE_INT); + } + + public static final String REARRANGE_VL = VECTOR_PREFIX + "REARRANGE_VL" + POSTFIX; + static { + vectorNode(REARRANGE_VL, "VectorRearrange", TYPE_LONG); + } + + public static final String REARRANGE_VF = VECTOR_PREFIX + "REARRANGE_VF" + POSTFIX; + static { + vectorNode(REARRANGE_VF, "VectorRearrange", TYPE_FLOAT); + } + + public static final String REARRANGE_VD = VECTOR_PREFIX + "REARRANGE_VD" + POSTFIX; + static { + vectorNode(REARRANGE_VD, "VectorRearrange", TYPE_DOUBLE); + } + public static final String ADD_P_OF = COMPOSITE_PREFIX + "ADD_P_OF" + POSTFIX; static { String regex = START + "addP_" + IS_REPLACED + MID + ".*" + END; diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java new file mode 100644 index 0000000000000..97ee86544ae31 --- /dev/null +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java @@ -0,0 +1,222 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * @test + * @bug 8350463 + * @summary AArch64: Add vector rearrange support for small lane count vectors + * @requires (os.simpleArch == "x64" & vm.cpu.features ~= ".*avx.*") | os.arch=="aarch64" + * @modules jdk.incubator.vector + * @library /test/lib / + * + * @run driver compiler.vectorapi.VectorRearrangeTest + */ + +package compiler.vectorapi; + +import jdk.incubator.vector.*; +import compiler.lib.ir_framework.*; +import java.util.Random; +import jdk.test.lib.Utils; + +public class VectorRearrangeTest { + private static final int LENGTH = 2048; + private static final Random random = Utils.getRandomInstance(); + + private static final VectorSpecies bspec128 = ByteVector.SPECIES_128; + private static final VectorSpecies sspec128 = ShortVector.SPECIES_128; + private static final VectorSpecies ispec128 = IntVector.SPECIES_128; + private static final VectorSpecies lspec128 = LongVector.SPECIES_128; + private static final VectorSpecies fspec128 = FloatVector.SPECIES_128; + private static final VectorSpecies dspec128 = DoubleVector.SPECIES_128; + private static final VectorSpecies bspec64 = ByteVector.SPECIES_64; + private static final VectorSpecies sspec64 = ShortVector.SPECIES_64; + private static final VectorSpecies ispec64 = IntVector.SPECIES_64; + private static final VectorSpecies fspec64 = FloatVector.SPECIES_64; + + private static byte[] bsrc; + private static short[] ssrc; + private static int[] isrc; + private static long[] lsrc; + private static float[] fsrc; + private static double[] dsrc; + + private static byte[] bdst; + private static short[] sdst; + private static int[] idst; + private static long[] ldst; + private static float[] fdst; + private static double[] ddst; + + private static int[][] indexes; + + static { + bsrc = new byte[LENGTH]; + ssrc = new short[LENGTH]; + isrc = new int[LENGTH]; + lsrc = new long[LENGTH]; + fsrc = new float[LENGTH]; + dsrc = new double[LENGTH]; + bdst = new byte[LENGTH]; + sdst = new short[LENGTH]; + idst = new int[LENGTH]; + ldst = new long[LENGTH]; + fdst = new float[LENGTH]; + ddst = new double[LENGTH]; + + for (int i = 0; i < LENGTH; ++i) { + bsrc[i] = (byte)random.nextInt(); + ssrc[i] = (short)random.nextInt(); + isrc[i] = random.nextInt(); + lsrc[i] = random.nextLong(); + fsrc[i] = random.nextFloat(); + dsrc[i] = random.nextDouble(); + } + + int[] nums = {2, 4, 8, 16}; + indexes = new int[4][]; + for (int i = 0; i < 4; i++) { + indexes[i] = new int[nums[i]]; + for (int j = 0; j < nums[i]; j++) { + indexes[i][j] = random.nextInt() & (nums[i] - 1); + } + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_8, " >0 "}) + public void rearrange_byte64() { + VectorShuffle shuffle = VectorShuffle.fromArray(bspec64, indexes[2], 0); + for (int i = 0; i < LENGTH; i += bspec64.length()) { + ByteVector.fromArray(bspec64, bsrc, i) + .rearrange(shuffle) + .intoArray(bdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_16, " >0 "}) + public void rearrange_byte128() { + VectorShuffle shuffle = VectorShuffle.fromArray(bspec128, indexes[3], 0); + for (int i = 0; i < LENGTH; i += bspec128.length()) { + 
ByteVector.fromArray(bspec128, bsrc, i) + .rearrange(shuffle) + .intoArray(bdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_4, " >0 "}) + public void rearrange_short64() { + VectorShuffle shuffle = VectorShuffle.fromArray(sspec64, indexes[1], 0); + for (int i = 0; i < LENGTH; i += sspec64.length()) { + ShortVector.fromArray(sspec64, ssrc, i) + .rearrange(shuffle) + .intoArray(sdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_8, " >0 "}) + public void rearrange_short128() { + VectorShuffle shuffle = VectorShuffle.fromArray(sspec128, indexes[2], 0); + for (int i = 0; i < LENGTH; i += sspec128.length()) { + ShortVector.fromArray(sspec128, ssrc, i) + .rearrange(shuffle) + .intoArray(sdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) + public void rearrange_int64() { + VectorShuffle shuffle = VectorShuffle.fromArray(ispec64, indexes[0], 0); + for (int i = 0; i < LENGTH; i += ispec64.length()) { + IntVector.fromArray(ispec64, isrc, i) + .rearrange(shuffle) + .intoArray(idst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_4, " >0 "}) + public void rearrange_int128() { + VectorShuffle shuffle = VectorShuffle.fromArray(ispec128, indexes[1], 0); + for (int i = 0; i < LENGTH; i += ispec128.length()) { + IntVector.fromArray(ispec128, isrc, i) + .rearrange(shuffle) + .intoArray(idst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VL, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) + public void rearrange_long128() { + VectorShuffle shuffle = VectorShuffle.fromArray(lspec128, indexes[0], 0); + for (int i = 0; i < LENGTH; i += lspec128.length()) { + LongVector.fromArray(lspec128, lsrc, i) + .rearrange(shuffle) + .intoArray(ldst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VF, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) + public void rearrange_float64() { + VectorShuffle shuffle = VectorShuffle.fromArray(fspec64, indexes[0], 0); + for (int i = 0; i < LENGTH; i += fspec64.length()) { + FloatVector.fromArray(fspec64, fsrc, i) + .rearrange(shuffle) + .intoArray(fdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VF, IRNode.VECTOR_SIZE_4, " >0 "}) + public void rearrange_float128() { + VectorShuffle shuffle = VectorShuffle.fromArray(fspec128, indexes[1], 0); + for (int i = 0; i < LENGTH; i += fspec128.length()) { + FloatVector.fromArray(fspec128, fsrc, i) + .rearrange(shuffle) + .intoArray(fdst, i); + } + } + + @Test + @IR(counts = {IRNode.REARRANGE_VD, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) + public void rearrange_double128() { + VectorShuffle shuffle = VectorShuffle.fromArray(dspec128, indexes[0], 0); + for (int i = 0; i < LENGTH; i += dspec128.length()) { + DoubleVector.fromArray(dspec128, dsrc, i) + .rearrange(shuffle) + .intoArray(ddst, i); + } + } + + public static void main(String[] args) { + TestFramework testFramework = new TestFramework(); + testFramework.setDefaultWarmup(10000) + .addFlags("--add-modules=jdk.incubator.vector", "-XX:-TieredCompilation") + .start(); + } +} From 1cbff61fbcf4c289048abcc076be7f3a98f4ac03 Mon Sep 17 00:00:00 2001 From: Xiaohong Gong Date: Tue, 18 Mar 2025 02:31:47 +0000 Subject: [PATCH 3/4] Update IR test based on the review comment --- .../vectorapi/VectorRearrangeTest.java | 133 +++++++++++++++--- 1 file changed, 111 insertions(+), 22 deletions(-) diff --git 
a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java index 97ee86544ae31..a2638504235ea 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java @@ -25,7 +25,6 @@ * @test * @bug 8350463 * @summary AArch64: Add vector rearrange support for small lane count vectors - * @requires (os.simpleArch == "x64" & vm.cpu.features ~= ".*avx.*") | os.arch=="aarch64" * @modules jdk.incubator.vector * @library /test/lib / * @@ -34,14 +33,14 @@ package compiler.vectorapi; -import jdk.incubator.vector.*; +import compiler.lib.generators.*; import compiler.lib.ir_framework.*; -import java.util.Random; -import jdk.test.lib.Utils; +import jdk.incubator.vector.*; +import jdk.test.lib.Asserts; public class VectorRearrangeTest { private static final int LENGTH = 2048; - private static final Random random = Utils.getRandomInstance(); + private static final Generators random = Generators.G; private static final VectorSpecies bspec128 = ByteVector.SPECIES_128; private static final VectorSpecies sspec128 = ShortVector.SPECIES_128; @@ -84,27 +83,27 @@ public class VectorRearrangeTest { fdst = new float[LENGTH]; ddst = new double[LENGTH]; - for (int i = 0; i < LENGTH; ++i) { - bsrc[i] = (byte)random.nextInt(); - ssrc[i] = (short)random.nextInt(); - isrc[i] = random.nextInt(); - lsrc[i] = random.nextLong(); - fsrc[i] = random.nextFloat(); - dsrc[i] = random.nextDouble(); + Generator byteGen = random.uniformInts(Byte.MIN_VALUE, Byte.MAX_VALUE); + Generator shortGen = random.uniformInts(Short.MIN_VALUE, Short.MAX_VALUE); + for (int i = 0; i < LENGTH; i++) { + bsrc[i] = byteGen.next().byteValue(); + ssrc[i] = shortGen.next().shortValue(); } + random.fill(random.ints(), isrc); + random.fill(random.longs(), lsrc); + random.fill(random.floats(), fsrc); + random.fill(random.doubles(), dsrc); int[] nums = {2, 4, 8, 16}; indexes = new int[4][]; for (int i = 0; i < 4; i++) { indexes[i] = new int[nums[i]]; - for (int j = 0; j < nums[i]; j++) { - indexes[i][j] = random.nextInt() & (nums[i] - 1); - } + random.fill(random.uniformInts(0, nums[i] - 1), indexes[i]); } } @Test - @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_8, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_8, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_byte64() { VectorShuffle shuffle = VectorShuffle.fromArray(bspec64, indexes[2], 0); for (int i = 0; i < LENGTH; i += bspec64.length()) { @@ -114,8 +113,17 @@ public void rearrange_byte64() { } } + @Check(test = "rearrange_byte64") + public void rearrange_byte64_verify() { + for (int i = 0; i < LENGTH; i += bspec64.length()) { + for (int j = 0; j < bspec64.length(); j++) { + Asserts.assertEquals(bsrc[indexes[2][j] + i], bdst[i + j]); + } + } + } + @Test - @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_16, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VB, IRNode.VECTOR_SIZE_16, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_byte128() { VectorShuffle shuffle = VectorShuffle.fromArray(bspec128, indexes[3], 0); for (int i = 0; i < LENGTH; i += bspec128.length()) { @@ -125,8 +133,17 @@ public void rearrange_byte128() { } } + @Check(test = "rearrange_byte128") + public void rearrange_byte128_verify() { + for (int i = 0; i < LENGTH; i += bspec128.length()) { + for (int j = 0; j < bspec128.length(); j++) { + Asserts.assertEquals(bsrc[indexes[3][j] + i], bdst[i 
+ j]); + } + } + } + @Test - @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_4, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_4, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_short64() { VectorShuffle shuffle = VectorShuffle.fromArray(sspec64, indexes[1], 0); for (int i = 0; i < LENGTH; i += sspec64.length()) { @@ -136,8 +153,17 @@ public void rearrange_short64() { } } + @Check(test = "rearrange_short64") + public void rearrange_short64_verify() { + for (int i = 0; i < LENGTH; i += sspec64.length()) { + for (int j = 0; j < sspec64.length(); j++) { + Asserts.assertEquals(ssrc[indexes[1][j] + i], sdst[i + j]); + } + } + } + @Test - @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_8, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VS, IRNode.VECTOR_SIZE_8, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_short128() { VectorShuffle shuffle = VectorShuffle.fromArray(sspec128, indexes[2], 0); for (int i = 0; i < LENGTH; i += sspec128.length()) { @@ -147,6 +173,15 @@ public void rearrange_short128() { } } + @Check(test = "rearrange_short128") + public void rearrange_short128_verify() { + for (int i = 0; i < LENGTH; i += sspec128.length()) { + for (int j = 0; j < sspec128.length(); j++) { + Asserts.assertEquals(ssrc[indexes[2][j] + i], sdst[i + j]); + } + } + } + @Test @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) public void rearrange_int64() { @@ -158,8 +193,17 @@ public void rearrange_int64() { } } + @Check(test = "rearrange_int64") + public void rearrange_int64_verify() { + for (int i = 0; i < LENGTH; i += ispec64.length()) { + for (int j = 0; j < ispec64.length(); j++) { + Asserts.assertEquals(isrc[indexes[0][j] + i], idst[i + j]); + } + } + } + @Test - @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_4, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VI, IRNode.VECTOR_SIZE_4, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_int128() { VectorShuffle shuffle = VectorShuffle.fromArray(ispec128, indexes[1], 0); for (int i = 0; i < LENGTH; i += ispec128.length()) { @@ -169,6 +213,15 @@ public void rearrange_int128() { } } + @Check(test = "rearrange_int128") + public void rearrange_int128_verify() { + for (int i = 0; i < LENGTH; i += ispec128.length()) { + for (int j = 0; j < ispec128.length(); j++) { + Asserts.assertEquals(isrc[indexes[1][j] + i], idst[i + j]); + } + } + } + @Test @IR(counts = {IRNode.REARRANGE_VL, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) public void rearrange_long128() { @@ -180,6 +233,15 @@ public void rearrange_long128() { } } + @Check(test = "rearrange_long128") + public void rearrange_long128_verify() { + for (int i = 0; i < LENGTH; i += lspec128.length()) { + for (int j = 0; j < lspec128.length(); j++) { + Asserts.assertEquals(lsrc[indexes[0][j] + i], ldst[i + j]); + } + } + } + @Test @IR(counts = {IRNode.REARRANGE_VF, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) public void rearrange_float64() { @@ -191,8 +253,17 @@ public void rearrange_float64() { } } + @Check(test = "rearrange_float64") + public void rearrange_float64_verify() { + for (int i = 0; i < LENGTH; i += fspec64.length()) { + for (int j = 0; j < fspec64.length(); j++) { + Asserts.assertEquals(fsrc[indexes[0][j] + i], fdst[i + j]); + } + } + } + @Test - @IR(counts = {IRNode.REARRANGE_VF, IRNode.VECTOR_SIZE_4, " >0 "}) + @IR(counts = {IRNode.REARRANGE_VF, 
IRNode.VECTOR_SIZE_4, " >0 "}, applyIfCPUFeatureOr = {"avx", "true", "asimd", "true"}) public void rearrange_float128() { VectorShuffle shuffle = VectorShuffle.fromArray(fspec128, indexes[1], 0); for (int i = 0; i < LENGTH; i += fspec128.length()) { @@ -202,6 +273,15 @@ public void rearrange_float128() { } } + @Check(test = "rearrange_float128") + public void rearrange_float128_verify() { + for (int i = 0; i < LENGTH; i += fspec128.length()) { + for (int j = 0; j < fspec128.length(); j++) { + Asserts.assertEquals(fsrc[indexes[1][j] + i], fdst[i + j]); + } + } + } + @Test @IR(counts = {IRNode.REARRANGE_VD, IRNode.VECTOR_SIZE_2, " >0 "}, applyIfCPUFeature = {"asimd", "true"}) public void rearrange_double128() { @@ -213,10 +293,19 @@ public void rearrange_double128() { } } + @Check(test = "rearrange_double128") + public void rearrange_double128_verify() { + for (int i = 0; i < LENGTH; i += dspec128.length()) { + for (int j = 0; j < dspec128.length(); j++) { + Asserts.assertEquals(dsrc[indexes[0][j] + i], ddst[i + j]); + } + } + } + public static void main(String[] args) { TestFramework testFramework = new TestFramework(); testFramework.setDefaultWarmup(10000) - .addFlags("--add-modules=jdk.incubator.vector", "-XX:-TieredCompilation") + .addFlags("--add-modules=jdk.incubator.vector") .start(); } } From 5249c9ec408dd5a8341b3ccb0dacb09c6e09220d Mon Sep 17 00:00:00 2001 From: Xiaohong Gong Date: Thu, 20 Mar 2025 02:40:18 +0000 Subject: [PATCH 4/4] Use a smaller warmup and array length in IR test --- .../hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java index a2638504235ea..f2d172b888812 100644 --- a/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java +++ b/test/hotspot/jtreg/compiler/vectorapi/VectorRearrangeTest.java @@ -39,7 +39,7 @@ import jdk.test.lib.Asserts; public class VectorRearrangeTest { - private static final int LENGTH = 2048; + private static final int LENGTH = 1024; private static final Generators random = Generators.G; private static final VectorSpecies bspec128 = ByteVector.SPECIES_128; @@ -304,7 +304,7 @@ public void rearrange_double128_verify() { public static void main(String[] args) { TestFramework testFramework = new TestFramework(); - testFramework.setDefaultWarmup(10000) + testFramework.setDefaultWarmup(5000) .addFlags("--add-modules=jdk.incubator.vector") .start(); }
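
For reference, below is a minimal Vector API sketch of the 2-lane
rearrange that this series intrinsifies on AArch64. The class name is
illustrative and not part of the patch; run with
--add-modules=jdk.incubator.vector:

```
import jdk.incubator.vector.LongVector;
import jdk.incubator.vector.VectorShuffle;

public class SmallLaneRearrange {
    public static void main(String[] args) {
        // A 2-lane long vector. Before this series, VectorRearrange was
        // rejected by the AArch64 matcher for lane counts < 4 and fell
        // back to Java code; with it, this compiles to the new NEON/SVE
        // rearrange implementation.
        LongVector v = LongVector.fromArray(LongVector.SPECIES_128,
                                            new long[] {1L, 2L}, 0);
        // Swap the two lanes: [1, 2] -> [2, 1].
        VectorShuffle<Long> swap =
                VectorShuffle.fromValues(LongVector.SPECIES_128, 1, 0);
        long[] res = new long[2];
        v.rearrange(swap).intoArray(res, 0);
        System.out.println(res[0] + " " + res[1]); // prints "2 1"
    }
}
```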