Skip to content

Commit

Permalink
8322768: Optimize non-subword vector compress and expand APIs for AVX…
Browse files Browse the repository at this point in the history
…2 target.

Reviewed-by: epeter, sviswanathan
  • Loading branch information
Jatin Bhateja committed Jan 25, 2024
1 parent 9d1a6d1 commit 6d36eb7
Show file tree
Hide file tree
Showing 10 changed files with 364 additions and 18 deletions.
6 changes: 3 additions & 3 deletions src/hotspot/cpu/x86/assembler_x86.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 1997, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 1997, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -816,8 +816,8 @@ class Assembler : public AbstractAssembler {
void check_relocation(RelocationHolder const& rspec, int format);
#endif

void emit_data(jint data, relocInfo::relocType rtype, int format);
void emit_data(jint data, RelocationHolder const& rspec, int format);
void emit_data(jint data, relocInfo::relocType rtype, int format = 0);
void emit_data(jint data, RelocationHolder const& rspec, int format = 0);
void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0);
void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0);

Expand Down
36 changes: 36 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5282,6 +5282,42 @@ void C2_MacroAssembler::vector_mask_compress(KRegister dst, KRegister src, Regis
kmov(dst, rtmp2);
}

#ifdef _LP64
void C2_MacroAssembler::vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src,
XMMRegister mask, Register rtmp, Register rscratch,
XMMRegister permv, XMMRegister xtmp, BasicType bt,
int vec_enc) {
assert(type2aelembytes(bt) >= 4, "");
assert(opcode == Op_CompressV || opcode == Op_ExpandV, "");
address compress_perm_table = nullptr;
address expand_perm_table = nullptr;
if (type2aelembytes(bt) == 8) {
compress_perm_table = StubRoutines::x86::compress_perm_table64();
expand_perm_table = StubRoutines::x86::expand_perm_table64();
vmovmskpd(rtmp, mask, vec_enc);
} else {
compress_perm_table = StubRoutines::x86::compress_perm_table32();
expand_perm_table = StubRoutines::x86::expand_perm_table32();
vmovmskps(rtmp, mask, vec_enc);
}
shlq(rtmp, 5); // for 32 byte permute row.
if (opcode == Op_CompressV) {
lea(rscratch, ExternalAddress(compress_perm_table));
} else {
lea(rscratch, ExternalAddress(expand_perm_table));
}
addptr(rtmp, rscratch);
vmovdqu(permv, Address(rtmp));
vpermps(dst, permv, src, Assembler::AVX_256bit);
vpxor(xtmp, xtmp, xtmp, vec_enc);
// Blend the result with zero vector using permute mask, each column entry
// in a permute table row contains either a valid permute index or a -1 (default)
// value, this can potentially be used as a blending mask after
// compressing/expanding the source vector lanes.
vblendvps(dst, dst, xtmp, permv, vec_enc, false, permv);
}
#endif

void C2_MacroAssembler::vector_compress_expand(int opcode, XMMRegister dst, XMMRegister src, KRegister mask,
bool merge, BasicType bt, int vec_enc) {
if (opcode == Op_CompressV) {
Expand Down
6 changes: 5 additions & 1 deletion src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2020, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -390,6 +390,10 @@

void vector_round_float_avx(XMMRegister dst, XMMRegister src, AddressLiteral float_sign_flip, AddressLiteral new_mxcsr, int vec_enc,
Register tmp, XMMRegister xtmp1, XMMRegister xtmp2, XMMRegister xtmp3, XMMRegister xtmp4);

void vector_compress_expand_avx2(int opcode, XMMRegister dst, XMMRegister src, XMMRegister mask,
Register rtmp, Register rscratch, XMMRegister permv, XMMRegister xtmp,
BasicType bt, int vec_enc);
#endif // _LP64

void udivI(Register rax, Register divisor, Register rdx);
Expand Down
93 changes: 93 additions & 0 deletions src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -951,6 +951,92 @@ address StubGenerator::generate_fp_mask(const char *stub_name, int64_t mask) {
return start;
}

// Emits the permute index table used for vector compression and returns its
// start address.
//
//   stub_name - name recorded for the generated stub.
//   esize     - element size in bits; 32 selects the int table, anything else
//               must be 64 (asserted) and selects the long table.
//
// Each row is selected by a vector mask value. The permute indices of the set
// mask bits are packed at the front of the row in ascending order, and the
// remainder of the row is padded with -1 (default) entries.
address StubGenerator::generate_compress_perm_table(const char *stub_name, int32_t esize) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", stub_name);
  address start = __ pc();

  if (esize != 32) {
    assert(esize == 64, "");
    // 16 rows x 4 quadword entries, indexed by a 4 bit mask. A quadword lane
    // is described by its pair of doubleword permute indices.
    for (int row = 0; row < 16; row++) {
      int filled = 0;
      for (int lane = 0; lane < 4; lane++) {
        if ((row & (1 << lane)) == 0) {
          continue;
        }
        __ emit_data(2 * lane, relocInfo::none);
        __ emit_data(2 * lane + 1, relocInfo::none);
        filled++;
      }
      while (filled < 4) {
        __ emit_data64(-1L, relocInfo::none);
        filled++;
      }
    }
    return start;
  }

  // 256 rows x 8 int entries, indexed by an 8 bit mask.
  for (int row = 0; row < 256; row++) {
    int filled = 0;
    for (int lane = 0; lane < 8; lane++) {
      if ((row & (1 << lane)) == 0) {
        continue;
      }
      __ emit_data(lane, relocInfo::none);
      filled++;
    }
    while (filled < 8) {
      __ emit_data(-1, relocInfo::none);
      filled++;
    }
  }
  return start;
}

// Emits the permute index table used for vector expansion and returns its
// start address.
//
//   stub_name - name recorded for the generated stub.
//   esize     - element size in bits; 32 selects the int table, anything else
//               must be 64 (asserted) and selects the long table.
//
// Each row is selected by a vector mask value. At every position whose mask bit
// is set the row holds the next source permute index (counting up from the
// least significant lane); every other position holds -1 (default).
address StubGenerator::generate_expand_perm_table(const char *stub_name, int32_t esize) {
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", stub_name);
  address start = __ pc();

  if (esize != 32) {
    assert(esize == 64, "");
    // 16 rows x 4 quadword entries, indexed by a 4 bit mask. A quadword index
    // is represented by its pair of doubleword permute indices.
    for (int row = 0; row < 16; row++) {
      int next_src = 0;
      for (int lane = 0; lane < 4; lane++) {
        const bool selected = (row & (1 << lane)) != 0;
        if (!selected) {
          __ emit_data64(-1L, relocInfo::none);
          continue;
        }
        __ emit_data(2 * next_src, relocInfo::none);
        __ emit_data(2 * next_src + 1, relocInfo::none);
        next_src++;
      }
    }
    return start;
  }

  // 256 rows x 8 int entries, indexed by an 8 bit mask.
  for (int row = 0; row < 256; row++) {
    int next_src = 0;
    for (int lane = 0; lane < 8; lane++) {
      const bool selected = (row & (1 << lane)) != 0;
      if (selected) {
        __ emit_data(next_src, relocInfo::none);
        next_src++;
      } else {
        __ emit_data(-1, relocInfo::none);
      }
    }
  }
  return start;
}

address StubGenerator::generate_vector_mask(const char *stub_name, int64_t mask) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", stub_name);
Expand Down Expand Up @@ -4095,6 +4181,13 @@ void StubGenerator::generate_compiler_stubs() {
StubRoutines::x86::_vector_reverse_byte_perm_mask_int = generate_vector_reverse_byte_perm_mask_int("perm_mask_int");
StubRoutines::x86::_vector_reverse_byte_perm_mask_short = generate_vector_reverse_byte_perm_mask_short("perm_mask_short");

if (VM_Version::supports_avx2() && !VM_Version::supports_avx512vl()) {
StubRoutines::x86::_compress_perm_table32 = generate_compress_perm_table("compress_perm_table32", 32);
StubRoutines::x86::_compress_perm_table64 = generate_compress_perm_table("compress_perm_table64", 64);
StubRoutines::x86::_expand_perm_table32 = generate_expand_perm_table("expand_perm_table32", 32);
StubRoutines::x86::_expand_perm_table64 = generate_expand_perm_table("expand_perm_table64", 64);
}

if (VM_Version::supports_avx2() && !VM_Version::supports_avx512_vpopcntdq()) {
// lut implementation influenced by counting 1s algorithm from section 5-1 of Hackers' Delight.
StubRoutines::x86::_vector_popcount_lut = generate_popcount_avx_lut("popcount_lut");
Expand Down
6 changes: 5 additions & 1 deletion src/hotspot/cpu/x86/stubGenerator_x86_64.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2003, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2003, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -99,6 +99,10 @@ class StubGenerator: public StubCodeGenerator {

address generate_fp_mask(const char *stub_name, int64_t mask);

address generate_compress_perm_table(const char *stub_name, int32_t esize);

address generate_expand_perm_table(const char *stub_name, int32_t esize);

address generate_vector_mask(const char *stub_name, int64_t mask);

address generate_vector_byte_perm_mask(const char *stub_name);
Expand Down
6 changes: 5 additions & 1 deletion src/hotspot/cpu/x86/stubRoutines_x86.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -82,6 +82,10 @@ address StubRoutines::x86::_join_0_1_base64 = nullptr;
address StubRoutines::x86::_join_1_2_base64 = nullptr;
address StubRoutines::x86::_join_2_3_base64 = nullptr;
address StubRoutines::x86::_decoding_table_base64 = nullptr;
address StubRoutines::x86::_compress_perm_table32 = nullptr;
address StubRoutines::x86::_compress_perm_table64 = nullptr;
address StubRoutines::x86::_expand_perm_table32 = nullptr;
address StubRoutines::x86::_expand_perm_table64 = nullptr;
#endif
address StubRoutines::x86::_pshuffle_byte_flip_mask_addr = nullptr;

Expand Down
12 changes: 10 additions & 2 deletions src/hotspot/cpu/x86/stubRoutines_x86.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2013, 2023, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013, 2024, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
Expand Down Expand Up @@ -37,7 +37,7 @@ enum platform_dependent_constants {
_continuation_stubs_code_size = 1000 LP64_ONLY(+1000),
// AVX512 intrinsics add more code in 64-bit VM,
// Windows have more code to save/restore registers
_compiler_stubs_code_size = 20000 LP64_ONLY(+32000) WINDOWS_ONLY(+2000),
_compiler_stubs_code_size = 20000 LP64_ONLY(+39000) WINDOWS_ONLY(+2000),
_final_stubs_code_size = 10000 LP64_ONLY(+20000) WINDOWS_ONLY(+2000) ZGC_ONLY(+20000)
};

Expand All @@ -58,6 +58,10 @@ class x86 {
static address _float_sign_flip;
static address _double_sign_mask;
static address _double_sign_flip;
static address _compress_perm_table32;
static address _compress_perm_table64;
static address _expand_perm_table32;
static address _expand_perm_table64;

public:

Expand Down Expand Up @@ -338,6 +342,10 @@ class x86 {
static address base64_decoding_table_addr() { return _decoding_table_base64; }
static address base64_AVX2_decode_tables_addr() { return _avx2_decode_tables_base64; }
static address base64_AVX2_decode_LUT_tables_addr() { return _avx2_decode_lut_tables_base64; }
static address compress_perm_table32() { return _compress_perm_table32; }
static address compress_perm_table64() { return _compress_perm_table64; }
static address expand_perm_table32() { return _expand_perm_table32; }
static address expand_perm_table64() { return _expand_perm_table64; }
#endif
static address pshuffle_byte_flip_mask_addr() { return _pshuffle_byte_flip_mask_addr; }
static address arrays_hashcode_powers_of_31() { return (address)_arrays_hashcode_powers_of_31; }
Expand Down
1 change: 0 additions & 1 deletion src/hotspot/cpu/x86/stubRoutines_x86_64.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -44,4 +44,3 @@ address StubRoutines::x86::_float_sign_mask = nullptr;
address StubRoutines::x86::_float_sign_flip = nullptr;
address StubRoutines::x86::_double_sign_mask = nullptr;
address StubRoutines::x86::_double_sign_flip = nullptr;

31 changes: 22 additions & 9 deletions src/hotspot/cpu/x86/x86.ad
Original file line number Diff line number Diff line change
Expand Up @@ -1425,6 +1425,8 @@ bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_CompressV:
case Op_ExpandV:
case Op_PopCountVL:
if (UseAVX < 2) {
return false;
Expand Down Expand Up @@ -1659,12 +1661,6 @@ bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_CompressV:
case Op_ExpandV:
if (!VM_Version::supports_avx512vl()) {
return false;
}
break;
case Op_SqrtF:
if (UseSSE < 1) {
return false;
Expand Down Expand Up @@ -1952,13 +1948,12 @@ bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType bt) {
if (is_subword_type(bt) && !VM_Version::supports_avx512_vbmi2()) {
return false;
}
if (size_in_bits < 128 ) {
if (!is_LP64 && !VM_Version::supports_avx512vl() && size_in_bits < 512) {
return false;
}
if (size_in_bits < 512 && !VM_Version::supports_avx512vl()) {
if (size_in_bits < 128 ) {
return false;
}
break;
case Op_VectorLongToMask:
if (UseAVX < 1 || !is_LP64) {
return false;
Expand Down Expand Up @@ -9178,8 +9173,26 @@ instruct vmask_first_or_last_true_avx(rRegI dst, vec mask, immI size, rRegL tmp,
%}

// --------------------------------- Compress/Expand Operations ---------------------------
#ifdef _LP64
// Vector compress/expand with a vector register mask. The rule is selected only
// when AVX512VL is not supported and the vector is at most 32 bytes long. It
// matches both CompressV and ExpandV; the ideal opcode is forwarded to
// C2_MacroAssembler::vector_compress_expand_avx2 to tell the two apart.
// dst is declared TEMP_DEF, perm/xtmp/rtmp/rscratch are temporaries, and the
// flags register is killed.
// NOTE(review): the format string prints "vector_compress" for ExpandV nodes too.
instruct vcompress_reg_avx(vec dst, vec src, vec mask, rRegI rtmp, rRegL rscratch, vec perm, vec xtmp, rFlagsReg cr) %{
predicate(!VM_Version::supports_avx512vl() && Matcher::vector_length_in_bytes(n) <= 32);
match(Set dst (CompressV src mask));
match(Set dst (ExpandV src mask));
effect(TEMP_DEF dst, TEMP perm, TEMP xtmp, TEMP rtmp, TEMP rscratch, KILL cr);
format %{ "vector_compress $dst, $src, $mask \t!using $xtmp, $rtmp, $rscratch and $perm as TEMP" %}
ins_encode %{
// Op_CompressV or Op_ExpandV, depending on which match rule fired.
int opcode = this->ideal_Opcode();
int vlen_enc = vector_length_encoding(this);
// Element type of the vector, passed through to the macro assembler routine.
BasicType bt = Matcher::vector_element_basic_type(this);
__ vector_compress_expand_avx2(opcode, $dst$$XMMRegister, $src$$XMMRegister, $mask$$XMMRegister, $rtmp$$Register,
$rscratch$$Register, $perm$$XMMRegister, $xtmp$$XMMRegister, bt, vlen_enc);
%}
ins_pipe( pipe_slow );
%}
#endif

instruct vcompress_expand_reg_evex(vec dst, vec src, kReg mask) %{
predicate(VM_Version::supports_avx512vl() || Matcher::vector_length_in_bytes(n) == 64);
match(Set dst (CompressV src mask));
match(Set dst (ExpandV src mask));
format %{ "vector_compress_expand $dst, $src, $mask" %}
Expand Down
Loading

3 comments on commit 6d36eb7

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JesperIRL
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/tag jdk-23+7

@openjdk
Copy link

@openjdk openjdk bot commented on 6d36eb7 Jan 25, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@JesperIRL The tag jdk-23+7 was successfully created.

Please sign in to comment.