8265956: JVM crashes when matching LShiftVB Node #3747

Closed · wants to merge 7 commits

Changes from 1 commit
@@ -5289,6 +5289,7 @@ instruct vsll8B_imm(vecD dst, vecD src, immI shift) %{
predicate(n->as_Vector()->length() == 4 ||
n->as_Vector()->length() == 8);
match(Set dst (LShiftVB src (LShiftCntV shift)));
match(Set dst (LShiftVB src shift));
This conversation was marked as resolved by Wanghuang-Huawei

XiaohongGong (Contributor) · Apr 29, 2021:

Does it make sense to change the mid-end code so that the immediate shift is always broadcast to a vector?

XiaohongGong (Contributor) · Apr 29, 2021:

This could avoid having to fix the missing rules on every affected architecture.

Wanghuang-Huawei (Author) · Apr 29, 2021:

x86 is missing the rule match(Set dst (LShiftVB src (LShiftCntV imm)));. If we choose to fix this by adding the LShiftCntV IR node, we should add that rule too. In any case, adding a LShiftCntV here may be clearer. I will refactor the code.
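For context, the crash scenario is ordinary Vector API code where the shift count is a compile-time constant, which C2 can present as (LShiftVB src imm) with no LShiftCntV in between. A minimal Java sketch (illustrative class name and values, not code from this PR; run with --add-modules jdk.incubator.vector):

import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

public class ByteShiftImmExample {
    static final VectorSpecies<Byte> SPECIES = ByteVector.SPECIES_128;

    public static void main(String[] args) {
        byte[] a = new byte[SPECIES.length()];
        for (int i = 0; i < a.length; i++) a[i] = (byte) i;

        // A constant scalar shift count: C2 may match this as
        // (LShiftVB src imm) rather than (LShiftVB src (LShiftCntV imm)).
        ByteVector v = ByteVector.fromArray(SPECIES, a, 0)
                                 .lanewise(VectorOperators.LSHL, 3);
        v.intoArray(a, 0);
    }
}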

ins_cost(INSN_COST);
format %{ "shl $dst, $src, $shift\t# vector (8B)" %}
ins_encode %{
@@ -5308,6 +5309,7 @@ instruct vsll8B_imm(vecD dst, vecD src, immI shift) %{
instruct vsll16B_imm(vecX dst, vecX src, immI shift) %{
predicate(n->as_Vector()->length() == 16);
match(Set dst (LShiftVB src (LShiftCntV shift)));
match(Set dst (LShiftVB src shift));
ins_cost(INSN_COST);
format %{ "shl $dst, $src, $shift\t# vector (16B)" %}
ins_encode %{
@@ -1174,6 +1174,17 @@ void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift)
}
}

// Byte-lane shifts are implemented on 16-bit lanes: callers first widen
// the bytes to words with vextendbw, shift here, then narrow back.
void C2_MacroAssembler::vshiftw_imm(int opcode, XMMRegister dst, int shift) {
  if (opcode == Op_RShiftVB) {
    psraw(dst, shift);    // arithmetic right shift of 16-bit lanes
  } else if (opcode == Op_LShiftVB) {
    psllw(dst, shift);    // left shift of 16-bit lanes
  } else {
    assert((opcode == Op_URShiftVB), "opcode should be Op_URShiftVB");
    psrlw(dst, shift);    // logical right shift of 16-bit lanes
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
switch (opcode) {
case Op_RShiftVB: // fall-through
@@ -1189,6 +1200,17 @@ void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XM
}
}

// AVX (three-operand) variant of vshiftw_imm above.
void C2_MacroAssembler::vshiftw_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len) {
  if (opcode == Op_RShiftVB) {
    vpsraw(dst, nds, shift, vector_len);
  } else if (opcode == Op_LShiftVB) {
    vpsllw(dst, nds, shift, vector_len);
  } else {
    assert((opcode == Op_URShiftVB), "opcode should be Op_URShiftVB");
    vpsrlw(dst, nds, shift, vector_len);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
switch (opcode) {
case Op_RShiftVL: psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
@@ -99,7 +99,9 @@
void vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
void vshiftd_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len);
void vshiftw(int opcode, XMMRegister dst, XMMRegister shift);
void vshiftw_imm(int opcode, XMMRegister dst, int shift);
void vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
void vshiftw_imm(int opcode, XMMRegister dst, XMMRegister nds, int shift, int vector_len);
void vshiftq(int opcode, XMMRegister dst, XMMRegister shift);
void vshiftq_imm(int opcode, XMMRegister dst, int shift);
void vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc);
@@ -5857,6 +5857,26 @@ instruct vshiftB(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
ins_pipe( pipe_slow );
%}

instruct vshiftB_imm(vec dst, vec src, immI8 shift, vec tmp, rRegI scratch) %{
predicate(vector_length(n) <= 8);
match(Set dst ( LShiftVB src shift));
match(Set dst ( RShiftVB src shift));
match(Set dst (URShiftVB src shift));
effect(TEMP dst, USE src, TEMP tmp, TEMP scratch);
format %{"vshiftB_imm $dst,$src,$shift" %}
ins_encode %{
assert(UseSSE > 3, "required");
int opcode = this->ideal_Opcode();
bool sign = (opcode != Op_URShiftVB);
// Widen bytes to words (sign- or zero-extended), shift the 16-bit lanes,
// mask each word down to its low byte, and pack back to bytes.
__ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister);
__ vshiftw_imm(opcode, $tmp$$XMMRegister, $shift$$constant);
__ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
__ pand($dst$$XMMRegister, $tmp$$XMMRegister);
__ packuswb($dst$$XMMRegister, $dst$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
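To make the lowering easier to review: each of these _imm rules widens bytes to 16-bit words, shifts the words, masks each word to its low byte, and packs back down, because x86 has no packed-byte shift instructions. A scalar Java model of one lane (a sketch, not code from this PR; it assumes the shift count is already reduced mod 8, as the Vector API guarantees for byte lanes, and should be run with -ea):

public class ByteShiftViaWordModel {
    // Model of vextendbw (sign-extend) -> psraw -> pand 0x00FF -> packuswb
    static byte sra(byte b, int s) {
        short w = b;                      // sign-extend byte to word
        int shifted = w >> (s & 7);       // psraw on the 16-bit lane
        return (byte) (shifted & 0xFF);   // keep low byte, pack
    }

    // Model of vextendbw (zero-extend) -> psrlw -> pand 0x00FF -> packuswb
    static byte srl(byte b, int s) {
        int w = b & 0xFF;                 // zero-extend byte to word
        return (byte) ((w >>> (s & 7)) & 0xFF);
    }

    // Model of vextendbw -> psllw -> pand 0x00FF -> packuswb
    static byte sll(byte b, int s) {
        int w = b & 0xFF;                 // extension kind is irrelevant for left shifts
        return (byte) ((w << (s & 7)) & 0xFF);
    }

    public static void main(String[] args) {
        for (int b = Byte.MIN_VALUE; b <= Byte.MAX_VALUE; b++) {
            for (int s = 0; s < 8; s++) {
                assert sra((byte) b, s) == (byte) (b >> s);
                assert srl((byte) b, s) == (byte) ((b & 0xFF) >>> s);
                assert sll((byte) b, s) == (byte) (b << s);
            }
        }
        System.out.println("byte-via-word model matches Java scalar semantics");
    }
}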

instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
predicate(vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
UseAVX <= 1);
@@ -5882,6 +5902,30 @@ instruct vshift16B(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratc
ins_pipe( pipe_slow );
%}

instruct vshift16B_imm(vec dst, vec src, immI8 shift, vec tmp1, vec tmp2, rRegI scratch) %{
predicate(vector_length(n) == 16 && UseAVX <= 1);
match(Set dst ( LShiftVB src shift));
match(Set dst ( RShiftVB src shift));
match(Set dst (URShiftVB src shift));
effect(TEMP dst, USE src, TEMP tmp1, TEMP tmp2, TEMP scratch);
format %{"vshift16B_imm $dst,$src,$shift" %}
ins_encode %{
assert(UseSSE > 3, "required");
int opcode = this->ideal_Opcode();
bool sign = (opcode != Op_URShiftVB);
__ vextendbw(sign, $tmp1$$XMMRegister, $src$$XMMRegister);
__ vshiftw_imm(opcode, $tmp1$$XMMRegister, $shift$$constant);
__ pshufd($tmp2$$XMMRegister, $src$$XMMRegister, 0xE);
__ vextendbw(sign, $tmp2$$XMMRegister, $tmp2$$XMMRegister);
__ vshiftw_imm(opcode, $tmp2$$XMMRegister, $shift$$constant);
__ movdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
__ pand($tmp2$$XMMRegister, $dst$$XMMRegister);
__ pand($dst$$XMMRegister, $tmp1$$XMMRegister);
__ packuswb($dst$$XMMRegister, $tmp2$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}

instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
predicate(vector_length(n) == 16 && VectorNode::is_vshift_cnt(n->in(2)) &&
UseAVX > 1);
@@ -5903,6 +5947,26 @@ instruct vshift16B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
ins_pipe( pipe_slow );
%}

instruct vshift16B_avx_imm(vec dst, vec src, immI8 shift, vec tmp, rRegI scratch) %{
predicate(vector_length(n) == 16 && UseAVX > 1);
match(Set dst ( LShiftVB src shift));
match(Set dst ( RShiftVB src shift));
match(Set dst (URShiftVB src shift));
effect(TEMP dst, TEMP tmp, TEMP scratch);
format %{"vshift16B_avx_imm $dst,$src,$shift" %}
ins_encode %{
int opcode = this->ideal_Opcode();
bool sign = (opcode != Op_URShiftVB);
int vlen_enc = Assembler::AVX_256bit;
__ vextendbw(sign, $tmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vshiftw_imm(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$constant, vlen_enc);
__ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
__ vextracti128_high($dst$$XMMRegister, $tmp$$XMMRegister);
__ vpackuswb($dst$$XMMRegister, $tmp$$XMMRegister, $dst$$XMMRegister, 0);
%}
ins_pipe( pipe_slow );
%}

instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
predicate(vector_length(n) == 32 && VectorNode::is_vshift_cnt(n->in(2)));
match(Set dst ( LShiftVB src shift));
@@ -5928,6 +5992,31 @@ instruct vshift32B_avx(vec dst, vec src, vec shift, vec tmp, rRegI scratch) %{
ins_pipe( pipe_slow );
%}

instruct vshift32B_avx_imm(vec dst, vec src, immI8 shift, vec tmp, rRegI scratch) %{
predicate(vector_length(n) == 32);
match(Set dst ( LShiftVB src shift));
match(Set dst ( RShiftVB src shift));
match(Set dst (URShiftVB src shift));
effect(TEMP dst, TEMP tmp, TEMP scratch);
format %{"vshift32B_avx_imm $dst,$src,$shift" %}
ins_encode %{
assert(UseAVX > 1, "required");
int opcode = this->ideal_Opcode();
bool sign = (opcode != Op_URShiftVB);
int vlen_enc = Assembler::AVX_256bit;
__ vextracti128_high($tmp$$XMMRegister, $src$$XMMRegister);
__ vextendbw(sign, $tmp$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
__ vextendbw(sign, $dst$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vshiftw_imm(opcode, $tmp$$XMMRegister, $tmp$$XMMRegister, $shift$$constant, vlen_enc);
__ vshiftw_imm(opcode, $dst$$XMMRegister, $dst$$XMMRegister, $shift$$constant, vlen_enc);
__ vpand($tmp$$XMMRegister, $tmp$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
__ vpand($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), vlen_enc, $scratch$$Register);
__ vpackuswb($dst$$XMMRegister, $dst$$XMMRegister, $tmp$$XMMRegister, vlen_enc);
__ vpermq($dst$$XMMRegister, $dst$$XMMRegister, 0xD8, vlen_enc); // restore qword order after the in-lane pack
%}
ins_pipe( pipe_slow );
%}
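The trailing vpermq is needed because vpackuswb packs within each 128-bit lane, leaving the four 64-bit chunks of the result out of order. A one-line Java model of the qword shuffle (illustrative; A = packed source bytes 0..15, B = packed source bytes 16..31, so the register holds {A0, B0, A1, B1} before the permute):

// vpermq imm8 0xD8 is binary 11_01_10_00: result[j] = q[(0xD8 >> 2*j) & 3],
// i.e. {q0, q2, q1, q3}, turning {A0, B0, A1, B1} back into {A0, A1, B0, B1}.
static long[] vpermq0xD8(long[] q) {
    return new long[] { q[0], q[2], q[1], q[3] };
}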

instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI scratch) %{
predicate(vector_length(n) == 64 && VectorNode::is_vshift_cnt(n->in(2)));
match(Set dst ( LShiftVB src shift));
@@ -5956,6 +6045,34 @@ instruct vshift64B_avx(vec dst, vec src, vec shift, vec tmp1, vec tmp2, rRegI sc
ins_pipe( pipe_slow );
%}

instruct vshift64B_avx_imm(vec dst, vec src, immI8 shift, vec tmp1, vec tmp2, rRegI scratch) %{
predicate(vector_length(n) == 64);
match(Set dst ( LShiftVB src shift));
match(Set dst ( RShiftVB src shift));
match(Set dst (URShiftVB src shift));
effect(TEMP dst, TEMP tmp1, TEMP tmp2, TEMP scratch);
format %{"vshift64B_avx_imm $dst,$src,$shift" %}
ins_encode %{
assert(UseAVX > 2, "required");
int opcode = this->ideal_Opcode();
bool sign = (opcode != Op_URShiftVB);
int vlen_enc = Assembler::AVX_512bit;
__ vextracti64x4($tmp1$$XMMRegister, $src$$XMMRegister, 1);
__ vextendbw(sign, $tmp1$$XMMRegister, $tmp1$$XMMRegister, vlen_enc);
__ vextendbw(sign, $tmp2$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vshiftw_imm(opcode, $tmp1$$XMMRegister, $tmp1$$XMMRegister, $shift$$constant, vlen_enc);
__ vshiftw_imm(opcode, $tmp2$$XMMRegister, $tmp2$$XMMRegister, $shift$$constant, vlen_enc);
__ vmovdqu($dst$$XMMRegister, ExternalAddress(vector_short_to_byte_mask()), $scratch$$Register);
__ vpbroadcastd($dst$$XMMRegister, $dst$$XMMRegister, vlen_enc);
__ vpand($tmp1$$XMMRegister, $tmp1$$XMMRegister, $dst$$XMMRegister, vlen_enc);
__ vpand($tmp2$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc);
__ vpackuswb($dst$$XMMRegister, $tmp1$$XMMRegister, $tmp2$$XMMRegister, vlen_enc);
__ evmovdquq($tmp2$$XMMRegister, ExternalAddress(vector_byte_perm_mask()), vlen_enc, $scratch$$Register);
__ vpermq($dst$$XMMRegister, $tmp2$$XMMRegister, $dst$$XMMRegister, vlen_enc); // restore qword order across the 128-bit lanes
%}
ins_pipe( pipe_slow );
%}

// Shorts vector logical right shift produces an incorrect Java result
// for negative data because Java code converts short values to int with
// sign extension before the shift. But char vectors are fine since chars are
@@ -0,0 +1,59 @@
/*
* Copyright (c) 2021, Huawei Technologies Co. Ltd. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

package compiler.vectorapi;

import jdk.incubator.vector.ShortVector;
import jdk.incubator.vector.VectorSpecies;
import jdk.incubator.vector.VectorShuffle;

/*
* @test
* @bug 8265956
* @modules jdk.incubator.vector
* @run main/othervm compiler.vectorapi.TestVectorShuffleIotaShort
*/

public class TestVectorShuffleIotaShort {
static final VectorSpecies<Short> SPECIESs = ShortVector.SPECIES_128;

static final int INVOC_COUNT = 50000;

static short[] as = {87, 65, 78, 71, 72, 69, 82, 69};

public static void testShuffleS() {
This conversation was marked as resolved by Wanghuang-Huawei

DamonFool (Member) · Apr 29, 2021:

IMO, this test is not enough, since the assembly code is not easy to review.

So I suggest improving your test:

  1. Test as many of the match patterns you added as possible
  2. Verify the results in your tests

Thanks.
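For instance, a result check along these lines might address point 2 — a sketch with a hypothetical helper name; the expected lanes assume the partial-wrap semantics documented for VectorShuffle.iota with wrap == false (out-of-range indices wrap into [-VLENGTH, VLENGTH - 1]):

// iota(SPECIESs, 0, 2, false) over 8 short lanes yields raw indices
// {0, 2, 4, 6, 8, 10, 12, 14}; the out-of-range ones partially wrap
// to negative values.
static final short[] EXPECTED = {0, 2, 4, 6, -8, -6, -4, -2};

static void checkShuffleS(short[] actual) {
    for (int i = 0; i < EXPECTED.length; i++) {
        if (actual[i] != EXPECTED[i]) {
            throw new RuntimeException("lane " + i + ": expected "
                    + EXPECTED[i] + ", got " + actual[i]);
        }
    }
}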

Wanghuang-Huawei (Author) · Apr 29, 2021:

Thank you for your review.
1. There is only one match pattern here; I take your suggestion to mean we should test all vector lengths. I will add the other lengths in my next commit.
2. I will add result-checking code here.

Wanghuang-Huawei (Author) · Apr 29, 2021:

@DamonFool I will add other vector lengths here. Thank you for your review!
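Coverage for the new immediate-shift rules across vector lengths might look like the sketch below (hypothetical test class, not part of this PR; LSHL, ASHR and LSHR with a constant count correspond to the LShiftVB, RShiftVB and URShiftVB patterns added above):

import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.VectorOperators;
import jdk.incubator.vector.VectorSpecies;

public class TestByteVectorShiftImm {
    static void run(VectorSpecies<Byte> species, byte[] a, byte[] r) {
        ByteVector v = ByteVector.fromArray(species, a, 0);
        v.lanewise(VectorOperators.LSHL, 3).intoArray(r, 0);  // LShiftVB with immediate
        v.lanewise(VectorOperators.ASHR, 3).intoArray(r, 0);  // RShiftVB with immediate
        v.lanewise(VectorOperators.LSHR, 3).intoArray(r, 0);  // URShiftVB with immediate
    }

    public static void main(String[] args) {
        byte[] a = new byte[64];
        byte[] r = new byte[64];
        for (int i = 0; i < a.length; i++) a[i] = (byte) (i - 32);
        // Warm up so C2 compiles the shifts and the new rules are exercised.
        for (int i = 0; i < 50_000; i++) {
            run(ByteVector.SPECIES_64, a, r);
            run(ByteVector.SPECIES_128, a, r);
            run(ByteVector.SPECIES_256, a, r);
            run(ByteVector.SPECIES_512, a, r);
        }
    }
}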

ShortVector sv = (ShortVector) VectorShuffle.iota(SPECIESs, 0, 2, false).toVector();
sv.intoArray(as, 0);
}

public static void main(String[] args) {

for (int i = 0; i < INVOC_COUNT; i++) {
testShuffleS();
}
for (int i = 0; i < as.length; i++) {
System.out.print(as[i] + ", ");
}
System.out.println();
}
}