Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8266720: Wrong implementation in LibraryCallKit::inline_vector_shuffle_iota #81

Open
wants to merge 4 commits into
base: vectorIntrinsics
Choose a base branch
from
Open
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
@@ -204,9 +204,13 @@ source %{
case BoolTest::eq: masm.sve_cmpeq(pd, size, pg, zn, zm); break;
case BoolTest::ne: masm.sve_cmpne(pd, size, pg, zn, zm); break;
case BoolTest::ge: masm.sve_cmpge(pd, size, pg, zn, zm); break;
case BoolTest::uge: masm.sve_cmphs(pd, size, pg, zn, zm); break;
case BoolTest::gt: masm.sve_cmpgt(pd, size, pg, zn, zm); break;
case BoolTest::ugt: masm.sve_cmphi(pd, size, pg, zn, zm); break;
case BoolTest::le: masm.sve_cmpge(pd, size, pg, zm, zn); break;
case BoolTest::ule: masm.sve_cmphs(pd, size, pg, zm, zn); break;
case BoolTest::lt: masm.sve_cmpgt(pd, size, pg, zm, zn); break;
case BoolTest::ult: masm.sve_cmphi(pd, size, pg, zm, zn); break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
@@ -199,9 +199,13 @@ source %{
case BoolTest::eq: masm.sve_cmpeq(pd, size, pg, zn, zm); break;
case BoolTest::ne: masm.sve_cmpne(pd, size, pg, zn, zm); break;
case BoolTest::ge: masm.sve_cmpge(pd, size, pg, zn, zm); break;
case BoolTest::uge: masm.sve_cmphs(pd, size, pg, zn, zm); break;
case BoolTest::gt: masm.sve_cmpgt(pd, size, pg, zn, zm); break;
case BoolTest::ugt: masm.sve_cmphi(pd, size, pg, zn, zm); break;
case BoolTest::le: masm.sve_cmpge(pd, size, pg, zm, zn); break;
case BoolTest::ule: masm.sve_cmphs(pd, size, pg, zm, zn); break;
case BoolTest::lt: masm.sve_cmpgt(pd, size, pg, zm, zn); break;
case BoolTest::ult: masm.sve_cmphi(pd, size, pg, zm, zn); break;
default:
assert(false, "unsupported");
ShouldNotReachHere();
@@ -3300,7 +3300,9 @@ void mvnw(Register Rd, Register Rm,
INSN(sve_cmpeq, 0b00100100, 0b1010, 0);
INSN(sve_cmpne, 0b00100100, 0b1011, 0);
INSN(sve_cmpge, 0b00100100, 0b1000, 0);
INSN(sve_cmphs, 0b00100100, 0b0000, 0);
INSN(sve_cmpgt, 0b00100100, 0b1001, 0);
INSN(sve_cmphi, 0b00100100, 0b0001, 0);
INSN(sve_fcmeq, 0b01100101, 0b0110, 1);
INSN(sve_fcmne, 0b01100101, 0b0111, 1);
INSN(sve_fcmgt, 0b01100101, 0b0101, 1);
@@ -453,10 +453,10 @@ bool LibraryCallKit::inline_vector_shuffle_iota() {
// Wrap the indices greater than lane count.
res = gvn().transform(VectorNode::make(Op_AndI, res, bcast_mod, num_elem, elem_bt));
} else {
ConINode* pred_node = (ConINode*)gvn().makecon(TypeInt::make(BoolTest::ge));
ConINode* pred_node = (ConINode*)gvn().makecon(TypeInt::make(BoolTest::ugt));
Copy link
Collaborator

@sviswa7 sviswa7 May 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unsigned comparison adds overhead and is not supported on all architectures.

Copy link
Collaborator Author

@Wanghuang-Huawei Wanghuang-Huawei May 14, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After exchanging notes with @XiaohongGong, I think we can also fix it like this:

    ConINode* pred_node = (ConINode*)gvn().makecon(TypeInt::make(BoolTest::ge));
    Node * lane_cnt_tmp  = gvn().makecon(TypeInt::make(num_elem - 1));
    Node * bcast_lane_cnt_tmp = gvn().transform(VectorNode::scalar2vector(lane_cnt_tmp, num_elem, type_bt));
    Node* mask = gvn().transform(new VectorMaskCmpNode(BoolTest::ge, bcast_lane_cnt_tmp, res, pred_node, vt));

    // Make the indices greater than lane count as -ve values. This matches the java side implementation.
    res = gvn().transform(VectorNode::make(Op_AndI, res, bcast_mod, num_elem, elem_bt));
    Node * lane_cnt  = gvn().makecon(TypeInt::make(num_elem)); // Add a mov & bcast here
    Node * bcast_lane_cnt = gvn().transform(VectorNode::scalar2vector(lane_cnt, num_elem, type_bt));
    Node * biased_val = gvn().transform(VectorNode::make(Op_SubI, res, bcast_lane_cnt, num_elem, elem_bt));
    res = gvn().transform(new VectorBlendNode(biased_val, res, mask));

Copy link
Collaborator Author

@Wanghuang-Huawei Wanghuang-Huawei May 14, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unsigned comparison adds overhead and is not supported on all architectures.

However, if we don't use ugt, we will encounter a problem if the vector length exceeds 1024 bits in the future. Changing < num_elem to <= 128 is only a solution for the 1024-bit case itself. If num_elem > 128, it will be invalid.

Copy link
Collaborator

@XiaohongGong XiaohongGong May 14, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently making it work well for <= 1024-bits makes sense to me. We can revisit this issue after the API issues for vector length > 1024-bits are fixed in future.

Copy link
Collaborator

@nsjian nsjian May 14, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently making it work well for <= 1024-bits makes sense to me. We can revisit this issue after the API issues for vector length > 1024-bits are fixed in future.

If so, don't we need at least some comments, or even a length check so that we do not inline for unsupported vector lengths?

Copy link
Collaborator

@XiaohongGong XiaohongGong May 14, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agree with @nsjian ! Thanks!

Copy link
Collaborator Author

@Wanghuang-Huawei Wanghuang-Huawei May 17, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If we use ge 1024, we will add two more nodes. Is that an extra cost here?

Node * lane_cnt = gvn().makecon(TypeInt::make(num_elem));
Node * bcast_lane_cnt = gvn().transform(VectorNode::scalar2vector(lane_cnt, num_elem, type_bt));
Node* mask = gvn().transform(new VectorMaskCmpNode(BoolTest::ge, bcast_lane_cnt, res, pred_node, vt));
Node* mask = gvn().transform(new VectorMaskCmpNode(BoolTest::ugt, bcast_lane_cnt, res, pred_node, vt));

// Make the indices greater than lane count as -ve values. This matches the java side implementation.
res = gvn().transform(VectorNode::make(Op_AndI, res, bcast_mod, num_elem, elem_bt));
@@ -1563,7 +1563,9 @@ def generate(kind, names):
["cmpeq", "__ sve_cmpeq(p1, __ B, p0, z0, z1);", "cmpeq\tp1.b, p0/z, z0.b, z1.b"],
["cmpne", "__ sve_cmpne(p1, __ H, p0, z2, z3);", "cmpne\tp1.h, p0/z, z2.h, z3.h"],
["cmpge", "__ sve_cmpge(p1, __ S, p2, z4, z5);", "cmpge\tp1.s, p2/z, z4.s, z5.s"],
["cmphs", "__ sve_cmphs(p1, __ S, p2, z4, z5);", "cmphs\tp1.s, p2/z, z4.s, z5.s"],
["cmpgt", "__ sve_cmpgt(p1, __ D, p3, z6, z7);", "cmpgt\tp1.d, p3/z, z6.d, z7.d"],
["cmphi", "__ sve_cmphi(p1, __ D, p3, z6, z7);", "cmphi\tp1.d, p3/z, z6.d, z7.d"],
["cmple", "__ sve_cmpge(p2, __ B, p0, z10, z11);", "cmple\tp2.b, p0/z, z11.b, z10.b"],
["cmplt", "__ sve_cmpgt(p3, __ S, p0, z16, z17);", "cmplt\tp3.s, p0/z, z17.s, z16.s"],
["cmpeq", "__ sve_cmpeq(p1, __ B, p4, z0, 15);", "cmpeq\tp1.b, p4/z, z0.b, #15"],
@@ -750,7 +750,9 @@
__ sve_cmpeq(p1, __ B, p0, z0, z1); // cmpeq p1.b, p0/z, z0.b, z1.b
__ sve_cmpne(p1, __ H, p0, z2, z3); // cmpne p1.h, p0/z, z2.h, z3.h
__ sve_cmpge(p1, __ S, p2, z4, z5); // cmpge p1.s, p2/z, z4.s, z5.s
__ sve_cmphs(p1, __ S, p2, z4, z5); // cmphs p1.s, p2/z, z4.s, z5.s
__ sve_cmpgt(p1, __ D, p3, z6, z7); // cmpgt p1.d, p3/z, z6.d, z7.d
__ sve_cmphi(p1, __ D, p3, z6, z7); // cmphi p1.d, p3/z, z6.d, z7.d
__ sve_cmpge(p2, __ B, p0, z10, z11); // cmple p2.b, p0/z, z11.b, z10.b
__ sve_cmpgt(p3, __ S, p0, z16, z17); // cmplt p3.s, p0/z, z17.s, z16.s
__ sve_cmpeq(p1, __ B, p4, z0, 15); // cmpeq p1.b, p4/z, z0.b, #15
@@ -999,30 +1001,30 @@
0x9101a1a0, 0xb10a5cc8, 0xd10810aa, 0xf10fd061,
0x120cb166, 0x321764bc, 0x52174681, 0x720c0227,
0x9241018e, 0xb25a2969, 0xd278b411, 0xf26aad01,
0x14000000, 0x17ffffd7, 0x1400031f, 0x94000000,
0x97ffffd4, 0x9400031c, 0x3400000a, 0x34fffa2a,
0x3400632a, 0x35000008, 0x35fff9c8, 0x350062c8,
0xb400000b, 0xb4fff96b, 0xb400626b, 0xb500001d,
0xb5fff91d, 0xb500621d, 0x10000013, 0x10fff8b3,
0x100061b3, 0x90000013, 0x36300016, 0x3637f836,
0x36306136, 0x3758000c, 0x375ff7cc, 0x375860cc,
0x14000000, 0x17ffffd7, 0x14000323, 0x94000000,
0x97ffffd4, 0x94000320, 0x3400000a, 0x34fffa2a,
0x340063aa, 0x35000008, 0x35fff9c8, 0x35006348,
0xb400000b, 0xb4fff96b, 0xb40062eb, 0xb500001d,
0xb5fff91d, 0xb500629d, 0x10000013, 0x10fff8b3,
0x10006233, 0x90000013, 0x36300016, 0x3637f836,
0x363061b6, 0x3758000c, 0x375ff7cc, 0x3758614c,
0x128313a0, 0x528a32c7, 0x7289173b, 0x92ab3acc,
0xd2a0bf94, 0xf2c285e8, 0x9358722f, 0x330e652f,
0x53067f3b, 0x93577c53, 0xb34a1aac, 0xd35a4016,
0x13946c63, 0x93c3dbc8, 0x54000000, 0x54fff5a0,
0x54005ea0, 0x54000001, 0x54fff541, 0x54005e41,
0x54000002, 0x54fff4e2, 0x54005de2, 0x54000002,
0x54fff482, 0x54005d82, 0x54000003, 0x54fff423,
0x54005d23, 0x54000003, 0x54fff3c3, 0x54005cc3,
0x54000004, 0x54fff364, 0x54005c64, 0x54000005,
0x54fff305, 0x54005c05, 0x54000006, 0x54fff2a6,
0x54005ba6, 0x54000007, 0x54fff247, 0x54005b47,
0x54000008, 0x54fff1e8, 0x54005ae8, 0x54000009,
0x54fff189, 0x54005a89, 0x5400000a, 0x54fff12a,
0x54005a2a, 0x5400000b, 0x54fff0cb, 0x540059cb,
0x5400000c, 0x54fff06c, 0x5400596c, 0x5400000d,
0x54fff00d, 0x5400590d, 0x5400000e, 0x54ffefae,
0x540058ae, 0x5400000f, 0x54ffef4f, 0x5400584f,
0x54005f20, 0x54000001, 0x54fff541, 0x54005ec1,
0x54000002, 0x54fff4e2, 0x54005e62, 0x54000002,
0x54fff482, 0x54005e02, 0x54000003, 0x54fff423,
0x54005da3, 0x54000003, 0x54fff3c3, 0x54005d43,
0x54000004, 0x54fff364, 0x54005ce4, 0x54000005,
0x54fff305, 0x54005c85, 0x54000006, 0x54fff2a6,
0x54005c26, 0x54000007, 0x54fff247, 0x54005bc7,
0x54000008, 0x54fff1e8, 0x54005b68, 0x54000009,
0x54fff189, 0x54005b09, 0x5400000a, 0x54fff12a,
0x54005aaa, 0x5400000b, 0x54fff0cb, 0x54005a4b,
0x5400000c, 0x54fff06c, 0x540059ec, 0x5400000d,
0x54fff00d, 0x5400598d, 0x5400000e, 0x54ffefae,
0x5400592e, 0x5400000f, 0x54ffef4f, 0x540058cf,
0xd40658e1, 0xd4014d22, 0xd4046543, 0xd4273f60,
0xd44cad80, 0xd503201f, 0xd69f03e0, 0xd6bf03e0,
0xd5033fdf, 0xd5033e9f, 0xd50332bf, 0xd61f0200,
@@ -1054,7 +1056,7 @@
0x791f226d, 0xf95aa2f3, 0xb9587bb7, 0x395f7176,
0x795d9143, 0x399e7e08, 0x799a2697, 0x79df3422,
0xb99c2624, 0xfd5c2374, 0xbd5fa1d9, 0xfd1d595a,
0xbd1b1869, 0x5800489b, 0x1800000b, 0xf8945060,
0xbd1b1869, 0x5800491b, 0x1800000b, 0xf8945060,
0xd8000000, 0xf8ae6ba0, 0xf99a0080, 0x1a070035,
0x3a0700a8, 0x5a0e0367, 0x7a11009b, 0x9a000380,
0xba1e030c, 0xda0f0320, 0xfa030301, 0x0b340b11,
@@ -1148,17 +1150,18 @@
0xe4484be0, 0xe460efe0, 0xe547e400, 0xe4014be0,
0xe4a84fe0, 0xe5f15000, 0x858043e0, 0x85a043ff,
0xe59f5d08, 0x0522c020, 0x05e6c0a4, 0x2401a001,
0x2443a051, 0x24858881, 0x24c78cd1, 0x240b8142,
0x24918213, 0x250f9001, 0x25508051, 0x25802491,
0x25df28c1, 0x25850c81, 0x251e10d1, 0x65816001,
0x65c36051, 0x65854891, 0x65c74cc1, 0x658b4152,
0x65d14203, 0x05733820, 0x05b238a4, 0x05f138e6,
0x0570396a, 0x25221420, 0x25640461, 0x25a614b2,
0x25eb0553, 0x25221c24, 0x25640c60, 0x25a61cb1,
0x25eb0d52, 0x65d0a001, 0x65d1a443, 0x65cbac85,
0x65deaa53, 0x65dfaa53, 0x0520a1e0, 0x0521a601,
0x052281e0, 0x05238601, 0x04a14026, 0x0568aca7,
0x05b23230, 0x853040af, 0xc5b040af, 0x1e601000,
0x2443a051, 0x24858881, 0x24850881, 0x24c78cd1,
0x24c70cd1, 0x240b8142, 0x24918213, 0x250f9001,
0x25508051, 0x25802491, 0x25df28c1, 0x25850c81,
0x251e10d1, 0x65816001, 0x65c36051, 0x65854891,
0x65c74cc1, 0x658b4152, 0x65d14203, 0x05733820,
0x05b238a4, 0x05f138e6, 0x0570396a, 0x25221420,
0x25640461, 0x25a614b2, 0x25eb0553, 0x25221c24,
0x25640c60, 0x25a61cb1, 0x25eb0d52, 0x65d0a001,
0x65d1a443, 0x65cbac85, 0x65deaa53, 0x65dfaa53,
0x0520a1e0, 0x0521a601, 0x052281e0, 0x05238601,
0x04a14026, 0x0568aca7, 0x05b23230, 0x853040af,
0xc5b040af, 0xe57080af, 0xe5b080af, 0x1e601000,
0x1e603000, 0x1e621000, 0x1e623000, 0x1e641000,
0x1e643000, 0x1e661000, 0x1e663000, 0x1e681000,
0x1e683000, 0x1e6a1000, 0x1e6a3000, 0x1e6c1000,
@@ -0,0 +1,87 @@
/*
* Copyright (c) 2021, Huawei Technologies Co. Ltd. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

package compiler.vectorapi;

import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.VectorSpecies;
import jdk.incubator.vector.VectorShuffle;

import org.testng.Assert;
import org.testng.annotations.Test;


/*
* @test
* @bug 8266720
* @modules jdk.incubator.vector
* @run testng/othervm compiler.vectorapi.TestVectorShuffleIotaByte1024
Copy link
Member

@PaulSandoz PaulSandoz May 12, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perhaps the test can be annotated, declaring it should only execute on ARM/SVE platforms. See the use of the @requires clause used in other JDK tests.

Copy link
Collaborator Author

@Wanghuang-Huawei Wanghuang-Huawei May 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for your review. I think this test applies to any arch which has ByteVector.SPECIES_MAX == 1024.

Copy link
Member

@PaulSandoz PaulSandoz May 13, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But we know which arches don't support, x86, PPC etc.

I am unsure why existing shuffle tests do not catch this problem. In fact, I would prefer we focus on that if we can rather than adding a specific test. Would you mind looking to see if we can expand on the existing shuffleTest?

Copy link
Collaborator Author

@Wanghuang-Huawei Wanghuang-Huawei Jun 4, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • Q : Why existing shuffle tests do not catch this problem?
  • A: Because we need vector_length >= 1024. However, in x86 we don't have this env because the longest register of x86 is 512 in AVX512.

*/

/**
 * Regression test for 8266720: checks the lane values produced by
 * {@code VectorShuffle.iota(species, 0, 3, false).toVector()} for a byte
 * species whose vectors are 1024 bits wide (128 byte lanes).
 *
 * The check is skipped on platforms where {@code ByteVector.SPECIES_MAX}
 * is not 1024 bits wide.
 */
@Test
public class TestVectorShuffleIotaByte1024 {
    static final VectorSpecies<Byte> SPECIESb_1024 = ByteVector.SPECIES_MAX;

    // Number of invocations of the kernel; overridable through the system property.
    static final int INVOC_COUNT = Integer.getInteger("jdk.incubator.vector.test.loop-iterations", 50000);

    // Output buffer (128 bytes). Its initial contents are overwritten by the
    // vector store in testShuffleIota_1024().
    static final byte[] ab_1024 = {50, 49, 47, 53, 47, 49, 50, 48, 50, 32, 46, 116, 105, 32, 115,
                                   110, 101, 104, 116, 103, 110, 101, 114, 116, 115, 32, 101,
                                   99, 110, 101, 115, 101, 114, 112, 44, 101, 118, 111, 108,
                                   32, 115, 110, 101, 112, 114, 97, 104, 115, 32, 101, 99, 110,
                                   101, 115, 98, 65, 46, 117, 111, 121, 32, 101, 118, 111, 108,
                                   32, 73, 46, 103, 110, 97, 117, 72, 32, 71, 78, 65, 87, 45, 45,
                                   33, 117, 111, 121, 32, 103, 110, 105, 115, 115, 105, 77, 46, 117,
                                   111, 121, 32, 111, 116, 32, 114, 101, 116, 116, 101, 108, 32,
                                   104, 116, 52, 32, 121, 109, 32, 115, 105, 32, 115, 105, 104, 116,
                                   44, 121, 116, 101, 101, 119, 83};

    // Expected lane values (128 entries) for start = 0, step = 3, wrap = false.
    static final byte[] expected_1024 = {0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48,
                                         51, 54, 57, 60, 63, 66, 69, 72, 75, 78, 81, 84, 87, 90, 93, 96,
                                         99, 102, 105, 108, 111, 114, 117, 120, 123, 126, -127, -124,
                                         -121, -118, -115, -112, -109, -106, -103, -100, -97, -94, -91,
                                         -88, -85, -82, -79, -76, -73, -70, -67, -64, -61, -58, -55, -52,
                                         -49, -46, -43, -40, -37, -34, -31, -28, -25, -22, -19, -16, -13,
                                         -10, -7, -4, -1, -126, -123, -120, -117, -114, -111, -108, -105,
                                         -102, -99, -96, -93, -90, -87, -84, -81, -78, -75, -72, -69, -66,
                                         -63, -60, -57, -54, -51, -48, -45, -42, -39, -36, -33, -30, -27,
                                         -24, -21, -18, -15, -12, -9, -6, -3};

    /**
     * Kernel under test: materializes the iota shuffle as a byte vector and
     * stores it into {@code ab_1024}.
     */
    static void testShuffleIota_1024() {
        ByteVector bv = (ByteVector) VectorShuffle.iota(SPECIESb_1024, 0, 3, false).toVector();
        // Fixed: the original stored through an undeclared variable `bv4`.
        bv.intoArray(ab_1024, 0);
    }

    /**
     * Invokes the kernel {@code INVOC_COUNT} times, then compares the stored
     * lanes against {@code expected_1024}.
     */
    static void testIota_1024() {
        for (int ic = 0; ic < INVOC_COUNT; ic++) {
            testShuffleIota_1024();
        }
        Assert.assertEquals(ab_1024, expected_1024);
    }

    /**
     * Test entry point. Runs the check only when the maximal byte species is
     * 1024 bits wide, because the buffers above hold exactly one such vector.
     */
    @Test
    static void testIota() {
        // Fixed: the original compared length() (the lane count, 128 for a
        // 1024-bit byte vector) against 1024, so the body could never run.
        // It also misspelled the field name as SPECESb_1024.
        if (SPECIESb_1024.vectorBitSize() == 1024) {
            testIota_1024();
        }
    }
}