8259846: [BACKOUT] JDK-8259278 Optimize Vector API slice and unslice operations

Reviewed-by: vlivanov, psandoz
Daniel D. Daugherty committed Jan 15, 2021
1 parent eb7fa00 commit b78cd63
Showing 45 changed files with 717 additions and 460 deletions.
10 changes: 0 additions & 10 deletions src/hotspot/cpu/x86/macroAssembler_x86.cpp
@@ -3006,16 +3006,6 @@ void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src
}
}

void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
assert(UseAVX > 0, "requires some form of AVX");
if (reachable(src)) {
Assembler::vpaddb(dst, nds, as_Address(src), vector_len);
} else {
lea(rscratch, src);
Assembler::vpaddb(dst, nds, Address(rscratch, 0), vector_len);
}
}

void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) {
assert(UseAVX > 0, "requires some form of AVX");
if (reachable(src)) {
3 changes: 1 addition & 2 deletions src/hotspot/cpu/x86/macroAssembler_x86.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -1241,7 +1241,6 @@ class MacroAssembler: public Assembler {

void vpaddb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
void vpaddb(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch);

void vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
18 changes: 1 addition & 17 deletions src/hotspot/cpu/x86/stubGenerator_x86_32.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 1999, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1999, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -610,21 +610,6 @@ class StubGenerator: public StubCodeGenerator {
return start;
}

address generate_vector_byte_shuffle_mask(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data(0x70707070, relocInfo::none, 0);
__ emit_data(0x70707070, relocInfo::none, 0);
__ emit_data(0x70707070, relocInfo::none, 0);
__ emit_data(0x70707070, relocInfo::none, 0);
__ emit_data(0xF0F0F0F0, relocInfo::none, 0);
__ emit_data(0xF0F0F0F0, relocInfo::none, 0);
__ emit_data(0xF0F0F0F0, relocInfo::none, 0);
__ emit_data(0xF0F0F0F0, relocInfo::none, 0);
return start;
}

address generate_vector_mask_long_double(const char *stub_name, int32_t maskhi, int32_t masklo) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
@@ -3981,7 +3966,6 @@ class StubGenerator: public StubCodeGenerator {
StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x03020100);
StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x01000100);
StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask_long_double("vector_long_shuffle_mask", 0x00000001, 0x0);
StubRoutines::x86::_vector_byte_perm_mask = generate_vector_byte_perm_mask("vector_byte_perm_mask");
12 changes: 0 additions & 12 deletions src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
@@ -808,17 +808,6 @@ class StubGenerator: public StubCodeGenerator {
return start;
}

address generate_vector_byte_shuffle_mask(const char *stub_name) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
address start = __ pc();
__ emit_data64(0x7070707070707070, relocInfo::none);
__ emit_data64(0x7070707070707070, relocInfo::none);
__ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
__ emit_data64(0xF0F0F0F0F0F0F0F0, relocInfo::none);
return start;
}

address generate_fp_mask(const char *stub_name, int64_t mask) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
@@ -6839,7 +6828,6 @@ address generate_avx_ghash_processBlocks() {
StubRoutines::x86::_vector_64_bit_mask = generate_vector_custom_i32("vector_64_bit_mask", Assembler::AVX_512bit,
0xFFFFFFFF, 0xFFFFFFFF, 0, 0);
StubRoutines::x86::_vector_int_shuffle_mask = generate_vector_mask("vector_int_shuffle_mask", 0x0302010003020100);
StubRoutines::x86::_vector_byte_shuffle_mask = generate_vector_byte_shuffle_mask("vector_byte_shuffle_mask");
StubRoutines::x86::_vector_short_shuffle_mask = generate_vector_mask("vector_short_shuffle_mask", 0x0100010001000100);
StubRoutines::x86::_vector_long_shuffle_mask = generate_vector_mask("vector_long_shuffle_mask", 0x0000000100000000);
StubRoutines::x86::_vector_long_sign_mask = generate_vector_mask("vector_long_sign_mask", 0x8000000000000000);
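
For context on the two generate_vector_byte_shuffle_mask stubs deleted above (this note is ours, not part of the commit): the emitted data is sixteen 0x70 bytes followed by sixteen 0xF0 bytes, one per byte position of a 256-bit vector. Added byte-wise to shuffle entries in [0, 32), the per-lane constant sets bit 7 exactly when an entry selects a byte from the other 128-bit lane, which is the bit vpblendvb keys on. A minimal sketch of that arithmetic in plain Java, with illustrative names:

// Hypothetical demo class; shows the removed 0x70/0xF0 mask arithmetic.
public class ByteShuffleMaskDemo {
    // Per-lane mask byte from the removed stub: 0x70 for byte positions
    // 0-15 (low lane), 0xF0 for positions 16-31 (high lane).
    static int maskFor(int lane) { return lane == 0 ? 0x70 : 0xF0; }

    public static void main(String[] args) {
        // Bit 7 of (index + mask) mod 256 is set exactly when the shuffle
        // entry selects a byte from the other 128-bit lane.
        int[] samples = {0, 15, 16, 31};
        for (int lane = 0; lane < 2; lane++) {
            for (int idx : samples) {
                int sum = (idx + maskFor(lane)) & 0xFF;
                boolean otherLane = (idx / 16) != lane;
                System.out.printf("lane=%d idx=%2d sum=0x%02X bit7=%b otherLane=%b%n",
                        lane, idx, sum, (sum & 0x80) != 0, otherLane);
            }
        }
    }
}

Running it shows bit 7 tracking otherLane for every sample index, including the mod-256 wrap for high-lane entries.
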
3 changes: 1 addition & 2 deletions src/hotspot/cpu/x86/stubRoutines_x86.cpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2018, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -47,7 +47,6 @@ address StubRoutines::x86::_vector_short_to_byte_mask = NULL;
address StubRoutines::x86::_vector_int_to_byte_mask = NULL;
address StubRoutines::x86::_vector_int_to_short_mask = NULL;
address StubRoutines::x86::_vector_all_bits_set = NULL;
address StubRoutines::x86::_vector_byte_shuffle_mask = NULL;
address StubRoutines::x86::_vector_short_shuffle_mask = NULL;
address StubRoutines::x86::_vector_int_shuffle_mask = NULL;
address StubRoutines::x86::_vector_long_shuffle_mask = NULL;
7 changes: 1 addition & 6 deletions src/hotspot/cpu/x86/stubRoutines_x86.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -149,7 +149,6 @@ class x86 {
static address _vector_32_bit_mask;
static address _vector_64_bit_mask;
static address _vector_int_shuffle_mask;
static address _vector_byte_shuffle_mask;
static address _vector_short_shuffle_mask;
static address _vector_long_shuffle_mask;
static address _vector_iota_indices;
@@ -281,10 +280,6 @@ class x86 {
return _vector_int_shuffle_mask;
}

static address vector_byte_shuffle_mask() {
return _vector_byte_shuffle_mask;
}

static address vector_short_shuffle_mask() {
return _vector_short_shuffle_mask;
}
84 changes: 18 additions & 66 deletions src/hotspot/cpu/x86/x86.ad
@@ -1,5 +1,5 @@
//
// Copyright (c) 2011, 2021, Oracle and/or its affiliates. All rights reserved.
// Copyright (c) 2011, 2020, Oracle and/or its affiliates. All rights reserved.
// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
//
// This code is free software; you can redistribute it and/or modify it
@@ -1354,7 +1354,6 @@ Assembler::Width widthForType(BasicType bt) {
static address vector_long_sign_mask() { return StubRoutines::x86::vector_long_sign_mask(); }
static address vector_all_bits_set() { return StubRoutines::x86::vector_all_bits_set(); }
static address vector_int_to_short_mask() { return StubRoutines::x86::vector_int_to_short_mask(); }
static address vector_byte_shufflemask() { return StubRoutines::x86::vector_byte_shuffle_mask(); }
static address vector_short_shufflemask() { return StubRoutines::x86::vector_short_shuffle_mask(); }
static address vector_int_shufflemask() { return StubRoutines::x86::vector_int_shuffle_mask(); }
static address vector_long_shufflemask() { return StubRoutines::x86::vector_long_shuffle_mask(); }
@@ -1692,9 +1691,9 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
return false; // Implementation limitation due to how shuffle is loaded
} else if (size_in_bits == 256 && UseAVX < 2) {
return false; // Implementation limitation
} else if (bt == T_BYTE && size_in_bits > 256 && !VM_Version::supports_avx512_vbmi()) {
} else if (bt == T_BYTE && size_in_bits >= 256 && !VM_Version::supports_avx512_vbmi()) {
return false; // Implementation limitation
} else if (bt == T_SHORT && size_in_bits > 256 && !VM_Version::supports_avx512bw()) {
} else if (bt == T_SHORT && size_in_bits >= 256 && !VM_Version::supports_avx512bw()) {
return false; // Implementation limitation
}
break;
@@ -7501,24 +7500,13 @@ instruct rearrangeB(vec dst, vec shuffle) %{
ins_pipe( pipe_slow );
%}

instruct rearrangeB_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
instruct rearrangeB_avx(vec dst, vec src, vec shuffle) %{
predicate(vector_element_basic_type(n) == T_BYTE &&
vector_length(n) == 32 && !VM_Version::supports_avx512_vbmi());
match(Set dst (VectorRearrange src shuffle));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
format %{ "vector_rearrange $dst, $shuffle, $src" %}
ins_encode %{
assert(UseAVX >= 2, "required");
// Swap src into vtmp1
__ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
// Shuffle swapped src to get entries from other 128 bit lane
__ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
// Shuffle original src to get entries from self 128 bit lane
__ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
// Create a blend mask by setting high bits for entries coming from other lane in shuffle
__ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
// Perform the blend
__ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
__ vpshufb($dst$$XMMRegister, $shuffle$$XMMRegister, $src$$XMMRegister, Assembler::AVX_256bit);
%}
ins_pipe( pipe_slow );
%}
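
The deleted rearrangeB_avx encoding above exists because AVX2 vpshufb shuffles only within each 128-bit lane; it therefore shuffles both the source and a lane-swapped copy (vperm2i128) and blends the two results on the sign bits produced by the mask addition. A behavioral sketch of that sequence in plain Java, with illustrative names (an emulation under our reading of the instruction semantics, not JDK code):

import java.util.Arrays;

// Hypothetical demo class; emulates the deleted lane-crossing byte rearrange.
public class LaneCrossingRearrangeDemo {
    // AVX2 vpshufb semantics: each result byte comes from its own 16-byte
    // lane; a set bit 7 in the shuffle entry zeroes the result byte.
    static byte[] vpshufb256(byte[] src, byte[] shuffle) {
        byte[] dst = new byte[32];
        for (int i = 0; i < 32; i++) {
            int s = shuffle[i] & 0xFF;
            if ((s & 0x80) != 0) continue;        // zeroing case
            dst[i] = src[(i / 16) * 16 + (s & 0x0F)];
        }
        return dst;
    }

    // The deleted encoding: shuffle the source and a lane-swapped copy
    // (vperm2i128), then blend (vpblendvb) on the sign bit of (shuffle + mask).
    static byte[] rearrange(byte[] src, byte[] shuffle) {
        byte[] swapped = new byte[32];            // vperm2i128 src, src, 1
        System.arraycopy(src, 16, swapped, 0, 16);
        System.arraycopy(src, 0, swapped, 16, 16);
        byte[] self  = vpshufb256(src, shuffle);
        byte[] other = vpshufb256(swapped, shuffle);
        byte[] dst = new byte[32];
        for (int i = 0; i < 32; i++) {
            int mask = (i < 16) ? 0x70 : 0xF0;    // byte shuffle mask stub data
            boolean cross = (((shuffle[i] & 0xFF) + mask) & 0x80) != 0;
            dst[i] = cross ? other[i] : self[i];
        }
        return dst;
    }

    public static void main(String[] args) {
        byte[] src = new byte[32], rev = new byte[32];
        for (int i = 0; i < 32; i++) { src[i] = (byte) i; rev[i] = (byte) (31 - i); }
        System.out.println(Arrays.equals(rearrange(src, rev), rev)); // true
    }
}
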
Expand All @@ -7539,40 +7527,26 @@ instruct rearrangeB_evex(vec dst, vec src, vec shuffle) %{

instruct loadShuffleS(vec dst, vec src, vec vtmp, rRegP scratch) %{
predicate(vector_element_basic_type(n) == T_SHORT &&
vector_length(n) <= 16 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
vector_length(n) <= 8 && !VM_Version::supports_avx512bw()); // NB! aligned with rearrangeS
match(Set dst (VectorLoadShuffle src));
effect(TEMP dst, TEMP vtmp, TEMP scratch);
format %{ "vector_load_shuffle $dst, $src\t! using $vtmp and $scratch as TEMP" %}
ins_encode %{
// Create a byte shuffle mask from short shuffle mask
// only byte shuffle instruction available on these platforms
int vlen_in_bytes = vector_length_in_bytes(this);
if (vlen_in_bytes <= 8) {
// Multiply each shuffle by two to get byte index
__ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
__ psllw($vtmp$$XMMRegister, 1);

// Duplicate to create 2 copies of byte index
__ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
__ psllw($dst$$XMMRegister, 8);
__ por($dst$$XMMRegister, $vtmp$$XMMRegister);

// Add one to get alternate byte index
__ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
__ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
} else {
int vlen_enc = vector_length_encoding(this);
// Multiply each shuffle by two to get byte index
__ vpmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister, vlen_enc);
__ vpsllw($vtmp$$XMMRegister, $vtmp$$XMMRegister, 1, vlen_enc);

// Duplicate to create 2 copies of byte index
__ vpsllw($dst$$XMMRegister, $vtmp$$XMMRegister, 8, vlen_enc);
__ vpor($dst$$XMMRegister, $dst$$XMMRegister, $vtmp$$XMMRegister, vlen_enc);
// Multiply each shuffle by two to get byte index
__ pmovzxbw($vtmp$$XMMRegister, $src$$XMMRegister);
__ psllw($vtmp$$XMMRegister, 1);

// Add one to get alternate byte index
__ vpaddb($dst$$XMMRegister, $dst$$XMMRegister, ExternalAddress(vector_short_shufflemask()), vlen_enc, $scratch$$Register);
}
// Duplicate to create 2 copies of byte index
__ movdqu($dst$$XMMRegister, $vtmp$$XMMRegister);
__ psllw($dst$$XMMRegister, 8);
__ por($dst$$XMMRegister, $vtmp$$XMMRegister);

// Add one to get alternate byte index
__ movdqu($vtmp$$XMMRegister, ExternalAddress(vector_short_shufflemask()), $scratch$$Register);
__ paddb($dst$$XMMRegister, $vtmp$$XMMRegister);
%}
ins_pipe( pipe_slow );
%}
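
The restored loadShuffleS encoding rebuilds a byte shuffle from a short shuffle, since these targets only have a byte shuffle instruction: each short index i expands to the byte pair (2i, 2i+1) via the shift by 1, the duplicate-and-shift by 8, and the paddb with the 0x0100 short shuffle mask. A scalar sketch of that index math, in plain Java with illustrative names (not JDK code):

// Hypothetical demo class; mirrors the short-to-byte shuffle expansion.
public class ShortShuffleToByteDemo {
    // Expand a short-element shuffle into the equivalent byte-element
    // shuffle: short index i selects bytes 2*i (low) and 2*i + 1 (high).
    static byte[] expand(byte[] shortShuffle) {
        byte[] byteShuffle = new byte[shortShuffle.length * 2];
        for (int i = 0; i < shortShuffle.length; i++) {
            int doubled = (shortShuffle[i] & 0xFF) << 1;   // psllw by 1
            byteShuffle[2 * i]     = (byte) doubled;       // duplicated copy
            byteShuffle[2 * i + 1] = (byte) (doubled + 1); // +1 on alternate
                                                           // bytes via the
                                                           // 0x0100 mask
        }
        return byteShuffle;
    }

    public static void main(String[] args) {
        // Shuffle {2, 0, 3, 1} over shorts -> {4,5, 0,1, 6,7, 2,3} over bytes.
        byte[] out = expand(new byte[] {2, 0, 3, 1});
        System.out.println(java.util.Arrays.toString(out));
    }
}
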
Expand All @@ -7589,28 +7563,6 @@ instruct rearrangeS(vec dst, vec shuffle) %{
ins_pipe( pipe_slow );
%}

instruct rearrangeS_avx(legVec dst, legVec src, vec shuffle, legVec vtmp1, legVec vtmp2, rRegP scratch) %{
predicate(vector_element_basic_type(n) == T_SHORT &&
vector_length(n) == 16 && !VM_Version::supports_avx512bw());
match(Set dst (VectorRearrange src shuffle));
effect(TEMP dst, TEMP vtmp1, TEMP vtmp2, TEMP scratch);
format %{ "vector_rearrange $dst, $shuffle, $src\t! using $vtmp1, $vtmp2, $scratch as TEMP" %}
ins_encode %{
assert(UseAVX >= 2, "required");
// Swap src into vtmp1
__ vperm2i128($vtmp1$$XMMRegister, $src$$XMMRegister, $src$$XMMRegister, 1);
// Shuffle swapped src to get entries from other 128 bit lane
__ vpshufb($vtmp1$$XMMRegister, $vtmp1$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
// Shuffle original src to get entries from self 128 bit lane
__ vpshufb($dst$$XMMRegister, $src$$XMMRegister, $shuffle$$XMMRegister, Assembler::AVX_256bit);
// Create a blend mask by setting high bits for entries coming from other lane in shuffle
__ vpaddb($vtmp2$$XMMRegister, $shuffle$$XMMRegister, ExternalAddress(vector_byte_shufflemask()), Assembler::AVX_256bit, $scratch$$Register);
// Perform the blend
__ vpblendvb($dst$$XMMRegister, $dst$$XMMRegister, $vtmp1$$XMMRegister, $vtmp2$$XMMRegister, Assembler::AVX_256bit);
%}
ins_pipe( pipe_slow );
%}

instruct loadShuffleS_evex(vec dst, vec src) %{
predicate(vector_element_basic_type(n) == T_SHORT &&
VM_Version::supports_avx512bw());
src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte128Vector.java
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -387,7 +387,14 @@ public Byte128Vector slice(int origin, Vector<Byte> v) {
@Override
@ForceInline
public Byte128Vector slice(int origin) {
return (Byte128Vector) super.sliceTemplate(origin); // specialize
if ((origin < 0) || (origin >= VLENGTH)) {
throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
} else {
Byte128Shuffle Iota = iotaShuffle();
VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((byte)(VLENGTH-origin))));
Iota = iotaShuffle(origin, 1, true);
return ZERO.blend(this.rearrange(Iota), BlendMask);
}
}

@Override
Expand All @@ -408,7 +415,14 @@ public Byte128Vector unslice(int origin, Vector<Byte> w, int part, VectorMask<By
@Override
@ForceInline
public Byte128Vector unslice(int origin) {
return (Byte128Vector) super.unsliceTemplate(origin); // specialize
if ((origin < 0) || (origin >= VLENGTH)) {
throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
} else {
Byte128Shuffle Iota = iotaShuffle();
VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((byte)(origin))));
Iota = iotaShuffle(-origin, 1, true);
return ZERO.blend(this.rearrange(Iota), BlendMask);
}
}

@Override
src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte256Vector.java
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -387,7 +387,14 @@ public Byte256Vector slice(int origin, Vector<Byte> v) {
@Override
@ForceInline
public Byte256Vector slice(int origin) {
return (Byte256Vector) super.sliceTemplate(origin); // specialize
if ((origin < 0) || (origin >= VLENGTH)) {
throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
} else {
Byte256Shuffle Iota = iotaShuffle();
VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((byte)(VLENGTH-origin))));
Iota = iotaShuffle(origin, 1, true);
return ZERO.blend(this.rearrange(Iota), BlendMask);
}
}

@Override
Expand All @@ -408,7 +415,14 @@ public Byte256Vector unslice(int origin, Vector<Byte> w, int part, VectorMask<By
@Override
@ForceInline
public Byte256Vector unslice(int origin) {
return (Byte256Vector) super.unsliceTemplate(origin); // specialize
if ((origin < 0) || (origin >= VLENGTH)) {
throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
} else {
Byte256Shuffle Iota = iotaShuffle();
VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((byte)(origin))));
Iota = iotaShuffle(-origin, 1, true);
return ZERO.blend(this.rearrange(Iota), BlendMask);
}
}

@Override
src/jdk.incubator.vector/share/classes/jdk/incubator/vector/Byte512Vector.java
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2017, 2021, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2017, 2020, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
@@ -387,7 +387,14 @@ public Byte512Vector slice(int origin, Vector<Byte> v) {
@Override
@ForceInline
public Byte512Vector slice(int origin) {
return (Byte512Vector) super.sliceTemplate(origin); // specialize
if ((origin < 0) || (origin >= VLENGTH)) {
throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
} else {
Byte512Shuffle Iota = iotaShuffle();
VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.LT, (broadcast((byte)(VLENGTH-origin))));
Iota = iotaShuffle(origin, 1, true);
return ZERO.blend(this.rearrange(Iota), BlendMask);
}
}

@Override
Expand All @@ -408,7 +415,14 @@ public Byte512Vector unslice(int origin, Vector<Byte> w, int part, VectorMask<By
@Override
@ForceInline
public Byte512Vector unslice(int origin) {
return (Byte512Vector) super.unsliceTemplate(origin); // specialize
if ((origin < 0) || (origin >= VLENGTH)) {
throw new ArrayIndexOutOfBoundsException("Index " + origin + " out of bounds for vector length " + VLENGTH);
} else {
Byte512Shuffle Iota = iotaShuffle();
VectorMask<Byte> BlendMask = Iota.toVector().compare(VectorOperators.GE, (broadcast((byte)(origin))));
Iota = iotaShuffle(-origin, 1, true);
return ZERO.blend(this.rearrange(Iota), BlendMask);
}
}

@Override
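
The restored slice and unslice bodies, repeated across the vector classes above, implement a shift with zero fill: a rearrange on a wrapped iota shuffle, blended with ZERO under a comparison mask. Scalar equivalents of those semantics, as a sketch with illustrative names (array-based, not the JDK implementation):

import java.util.Arrays;

// Hypothetical demo class; scalar semantics of the restored slice/unslice.
public class SliceUnsliceDemo {
    // slice(origin): shift elements left by origin, zero-filling the tail --
    // the restored code does this as rearrange(iota + origin) blended with
    // ZERO where iota >= VLENGTH - origin.
    static byte[] slice(byte[] v, int origin) {
        if (origin < 0 || origin >= v.length)
            throw new ArrayIndexOutOfBoundsException(origin);
        byte[] r = new byte[v.length];
        for (int i = 0; i + origin < v.length; i++)
            r[i] = v[i + origin];
        return r;
    }

    // unslice(origin): shift elements right by origin, zero-filling the
    // head -- rearrange(iota - origin) blended with ZERO where iota < origin.
    static byte[] unslice(byte[] v, int origin) {
        if (origin < 0 || origin >= v.length)
            throw new ArrayIndexOutOfBoundsException(origin);
        byte[] r = new byte[v.length];
        for (int i = origin; i < v.length; i++)
            r[i] = v[i - origin];
        return r;
    }

    public static void main(String[] args) {
        byte[] v = {1, 2, 3, 4};
        System.out.println(Arrays.toString(slice(v, 1)));   // [2, 3, 4, 0]
        System.out.println(Arrays.toString(unslice(v, 1))); // [0, 1, 2, 3]
    }
}
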
