8252847: Optimize primitive arrayCopy stubs using AVX-512 masked instructions

Reviewed-by: neliasso, kvn
Jatin Bhateja committed Oct 10, 2020
1 parent ec41046 commit 4b5ac3abacee0a4b06a9ed0ea57377ff903a90c3
Showing 11 changed files with 1,448 additions and 98 deletions.
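The idea of the change is to handle short copies and copy tails in the primitive arraycopy stubs with AVX-512 opmask (k-register) moves: a partial vector's worth of elements is copied in one masked load/store pair instead of a scalar tail loop. A standalone sketch of that idea in AVX-512 intrinsics (illustrative only, assumes AVX512BW; not code from this patch):

#include <immintrin.h>
#include <stdint.h>

// Copy n bytes (1..64) with a single masked load/store pair -- no tail loop.
static void masked_copy_tail(uint8_t* dst, const uint8_t* src, unsigned n) {
  __mmask64 k = (n == 64) ? ~0ULL : ((1ULL << n) - 1);  // low n lanes enabled
  __m512i v = _mm512_maskz_loadu_epi8(k, src);          // disabled lanes load as 0
  _mm512_mask_storeu_epi8(dst, k, v);                   // disabled lanes left untouched
}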
@@ -2589,6 +2589,38 @@ void Assembler::evmovdqub(XMMRegister dst, KRegister mask, Address src, int vect
  emit_operand(dst, src);
}

void Assembler::evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type) {
  assert(VM_Version::supports_avx512vlbw(), "");
  assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, "");
  InstructionMark im(this);
  bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG;
  int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? VEX_SIMD_F2 : VEX_SIMD_F3;
  InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attributes.set_embedded_opmask_register_specifier(mask);
  attributes.set_is_evex_instruction();
  vex_prefix(src, 0, dst->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
  emit_int8(0x6F);
  emit_operand(dst, src);
}

void Assembler::evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type) {
  assert(VM_Version::supports_avx512vlbw(), "");
  assert(src != xnoreg, "sanity");
  assert(type == T_BYTE || type == T_SHORT || type == T_CHAR || type == T_INT || type == T_LONG, "");
  InstructionMark im(this);
  bool wide = type == T_SHORT || type == T_CHAR || type == T_LONG;
  int prefix = (type == T_BYTE || type == T_SHORT || type == T_CHAR) ? VEX_SIMD_F2 : VEX_SIMD_F3;
  InstructionAttr attributes(vector_len, /* vex_w */ wide, /* legacy_mode */ false, /* no_mask_reg */ false, /* uses_vl */ true);
  attributes.set_address_attributes(/* tuple_type */ EVEX_FVM, /* input_size_in_bits */ EVEX_NObit);
  attributes.reset_is_clear_context();
  attributes.set_embedded_opmask_register_specifier(mask);
  attributes.set_is_evex_instruction();
  vex_prefix(dst, 0, src->encoding(), (Assembler::VexSimdPrefix)prefix, VEX_OPCODE_0F, &attributes);
  emit_int8(0x7F);
  emit_operand(src, dst);
}
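In both encoders the SIMD prefix and the EVEX.W bit jointly select the element-width form of the 0x6F load / 0x7F store opcode: F2 with W=0 is evmovdqu8 (T_BYTE), F2 with W=1 is evmovdqu16 (T_SHORT and T_CHAR), F3 with W=0 is evmovdqu32 (T_INT), and F3 with W=1 is evmovdqu64 (T_LONG). The store form also calls reset_is_clear_context(), so masked-off destination elements are merged (left unchanged) rather than zeroed. A usage fragment (register choices here are illustrative, not from the patch): with k1 holding a lane mask,

  evmovdqu(xmm0, k1, Address(rsi, 0), Assembler::AVX_512bit, T_BYTE);   // masked evmovdqu8 load
  evmovdqu(Address(rdi, 0), k1, xmm0, Assembler::AVX_512bit, T_BYTE);   // masked evmovdqu8 store

copies only the byte lanes whose mask bit is set.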

void Assembler::evmovdquw(XMMRegister dst, Address src, int vector_len) {
  assert(VM_Version::supports_evex(), "");
  InstructionMark im(this);
@@ -7803,6 +7835,13 @@ void Assembler::shlxq(Register dst, Register src1, Register src2) {
  emit_int16((unsigned char)0xF7, (0xC0 | encode));
}

void Assembler::shrxq(Register dst, Register src1, Register src2) {
  assert(VM_Version::supports_bmi2(), "");
  InstructionAttr attributes(AVX_128bit, /* vex_w */ true, /* legacy_mode */ true, /* no_mask_reg */ true, /* uses_vl */ true);
  int encode = vex_prefix_and_encode(dst->encoding(), src2->encoding(), src1->encoding(), VEX_SIMD_F2, VEX_OPCODE_0F_38, &attributes);
  emit_int16((unsigned char)0xF7, (0xC0 | encode));
}
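shrxq is the BMI2 three-operand shift: the count comes from a register and EFLAGS is not modified. copy64_masked_avx in the new arraycopy file uses it to turn a remaining-element count into a k-register mask. A scalar model of the instruction (a sketch only):

#include <stdint.h>

// shrxq dst, src1, src2  ==>  dst = src1 >> (src2 & 63), flags untouched
static inline uint64_t shrx64(uint64_t src1, uint64_t count) {
  return src1 >> (count & 63);
}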

#ifndef _LP64

void Assembler::incl(Register dst) {
@@ -794,7 +794,6 @@ class Assembler : public AbstractAssembler {

void decl(Register dst);
void decl(Address dst);
void decq(Register dst);
void decq(Address dst);

void incl(Register dst);
@@ -879,6 +878,7 @@ class Assembler : public AbstractAssembler {
void popa_uncached();
#endif
void vzeroupper_uncached();
void decq(Register dst);

void pusha();
void popa();
@@ -1487,6 +1487,10 @@ class Assembler : public AbstractAssembler {
void evmovdquq(XMMRegister dst, Address src, int vector_len);
void evmovdquq(XMMRegister dst, XMMRegister src, int vector_len);

// Generic move instructions.
void evmovdqu(Address dst, KRegister mask, XMMRegister src, int vector_len, int type);
void evmovdqu(XMMRegister dst, KRegister mask, Address src, int vector_len, int type);

// Move lower 64bit to high 64bit in 128bit register
void movlhps(XMMRegister dst, XMMRegister src);

@@ -1989,6 +1993,8 @@ class Assembler : public AbstractAssembler {

void shlxl(Register dst, Register src1, Register src2);
void shlxq(Register dst, Register src1, Register src2);
void shrxq(Register dst, Register src1, Register src2);


//====================VECTOR ARITHMETIC=====================================

@@ -7964,6 +7964,7 @@ void MacroAssembler::cache_wbsync(bool is_pre)
sfence();
}
}

#endif // _LP64

Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
@@ -1037,6 +1037,18 @@ class MacroAssembler: public Assembler {
Register rax, Register rcx, Register rdx, Register tmp);
#endif

#ifdef _LP64
  void arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                    Register to, Register count, int shift,
                                    Register index, Register temp,
                                    bool use64byteVector, Label& L_entry, Label& L_exit);

  void arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                             Register to, Register start_index, Register end_index,
                                             Register count, int shift, Register temp,
                                             bool use64byteVector, Label& L_entry, Label& L_exit);
#endif

private:

// these are private because users should be doing movflt/movdbl
@@ -1725,6 +1737,23 @@ class MacroAssembler: public Assembler {

void cache_wb(Address line);
void cache_wbsync(bool is_pre);

  void copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                         KRegister mask, Register length, Register index,
                         Register temp, int shift = Address::times_1, int offset = 0,
                         bool use64byteVector = false);

  void copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
                         KRegister mask, Register length, Register index,
                         Register temp, int shift = Address::times_1, int offset = 0);

  void copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                  int shift = Address::times_1, int offset = 0);

  void copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                  bool conjoint, int shift = Address::times_1, int offset = 0,
                  bool use64byteVector = false);

#endif // _LP64

void vallones(XMMRegister dst, int vector_len);
@@ -0,0 +1,249 @@
/*
* Copyright (c) 2020, Intel Corporation.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef _LP64

void MacroAssembler::arraycopy_avx3_special_cases(XMMRegister xmm, KRegister mask, Register from,
                                                  Register to, Register count, int shift,
                                                  Register index, Register temp,
                                                  bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;

  int size_mat[][6] = {
    /* T_BYTE  */ { 32, 64, 96, 128, 160, 192 },
    /* T_SHORT */ { 16, 32, 48,  64,  80,  96 },
    /* T_INT   */ {  8, 16, 24,  32,  40,  48 },
    /* T_LONG  */ {  4,  8, 12,  16,  20,  24 }
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  cmpq(count, size_mat[shift][0]);
  jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift);
  jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  BIND(L_entry_64);
  cmpq(count, size_mat[shift][1]);
  jccb(Assembler::greater, L_entry_96);
  copy64_masked_avx(to, from, xmm, mask, count, index, temp, shift, 0, use64byteVector);
  jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  BIND(L_entry_96);
  cmpq(count, size_mat[shift][2]);
  jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 64);
  jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  BIND(L_entry_128);
  cmpq(count, size_mat[shift][3]);
  jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 64);
  subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 96);
  jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  BIND(L_entry_160);
  cmpq(count, size_mat[shift][4]);
  jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 128);
  jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  BIND(L_entry_192);
  cmpq(count, size_mat[shift][5]);
  jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, index, xmm, false, shift, 0, use64byteVector);
  copy64_avx(to, from, index, xmm, false, shift, 64, use64byteVector);
  copy32_avx(to, from, index, xmm, shift, 128);
  subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, index, temp, shift, 160);
  jmp(L_exit);
}
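Each row of size_mat holds the byte thresholds 32..192 converted to element counts for that type, i.e. size_mat[shift][i] == (32 * (i + 1)) >> shift. The net effect of the six cases: any copy of at most 192 bytes takes at most three plain 32/64-byte vector moves plus exactly one masked move for the remainder, with no loop; longer copies jump to L_entry in the caller's main copy path. A scalar model of the dispatch (a sketch, with std::memcpy standing in for both the plain and the k-masked vector moves):

#include <cstddef>
#include <cstring>

static void special_cases_model(char* to, const char* from, size_t n) {  // n <= 192
  if      (n <=  32) { std::memcpy(to, from, n); }                       // one masked 32-byte move
  else if (n <=  64) { std::memcpy(to, from, n); }                       // one masked 64-byte move
  else if (n <=  96) { std::memcpy(to, from,  64); std::memcpy(to +  64, from +  64, n -  64); }
  else if (n <= 128) { std::memcpy(to, from,  96); std::memcpy(to +  96, from +  96, n -  96); }
  else if (n <= 160) { std::memcpy(to, from, 128); std::memcpy(to + 128, from + 128, n - 128); }
  else               { std::memcpy(to, from, 160); std::memcpy(to + 160, from + 160, n - 160); }
}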

void MacroAssembler::arraycopy_avx3_special_cases_conjoint(XMMRegister xmm, KRegister mask, Register from,
                                                           Register to, Register start_index, Register end_index,
                                                           Register count, int shift, Register temp,
                                                           bool use64byteVector, Label& L_entry, Label& L_exit) {
  Label L_entry_64, L_entry_96, L_entry_128;
  Label L_entry_160, L_entry_192;
  bool avx3 = MaxVectorSize > 32 && AVX3Threshold == 0;

  int size_mat[][6] = {
    /* T_BYTE  */ { 32, 64, 96, 128, 160, 192 },
    /* T_SHORT */ { 16, 32, 48,  64,  80,  96 },
    /* T_INT   */ {  8, 16, 24,  32,  40,  48 },
    /* T_LONG  */ {  4,  8, 12,  16,  20,  24 }
  };

  // Case A) Special case for length less than or equal to 32 bytes.
  cmpq(count, size_mat[shift][0]);
  jccb(Assembler::greater, L_entry_64);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  jmp(L_exit);

  // Case B) Special case for length less than or equal to 64 bytes.
  BIND(L_entry_64);
  cmpq(count, size_mat[shift][1]);
  jccb(Assembler::greater, L_entry_96);
  if (avx3) {
    copy64_masked_avx(to, from, xmm, mask, count, start_index, temp, shift, 0, true);
  } else {
    copy32_avx(to, from, end_index, xmm, shift, -32);
    subq(count, 32 >> shift);
    copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  }
  jmp(L_exit);

  // Case C) Special case for length less than or equal to 96 bytes.
  BIND(L_entry_96);
  cmpq(count, size_mat[shift][2]);
  jccb(Assembler::greater, L_entry_128);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  subq(count, 64 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  jmp(L_exit);

  // Case D) Special case for length less than or equal to 128 bytes.
  BIND(L_entry_128);
  cmpq(count, size_mat[shift][3]);
  jccb(Assembler::greater, L_entry_160);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -96);
  subq(count, 96 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  jmp(L_exit);

  // Case E) Special case for length less than or equal to 160 bytes.
  BIND(L_entry_160);
  cmpq(count, size_mat[shift][4]);
  jccb(Assembler::greater, L_entry_192);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  subq(count, 128 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  jmp(L_exit);

  // Case F) Special case for length less than or equal to 192 bytes.
  BIND(L_entry_192);
  cmpq(count, size_mat[shift][5]);
  jcc(Assembler::greater, L_entry);
  copy64_avx(to, from, end_index, xmm, true, shift, -64, use64byteVector);
  copy64_avx(to, from, end_index, xmm, true, shift, -128, use64byteVector);
  copy32_avx(to, from, end_index, xmm, shift, -160);
  subq(count, 160 >> shift);
  copy32_masked_avx(to, from, xmm, mask, count, start_index, temp, shift);
  jmp(L_exit);
}
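The conjoint variant peels whole 32/64-byte blocks from the high end of the range (negative offsets from end_index) and always places the single masked move at the low end (start_index), writing it last; when 64-byte vectors are enabled without a threshold (the avx3 flag, MaxVectorSize > 32 && AVX3Threshold == 0), case B collapses to one masked 64-byte move. An ordering sketch of the first three cases (std::memcpy stands in for the register-buffered vector moves, so this model assumes non-overlapping buffers and only illustrates store order):

#include <cstddef>
#include <cstring>

static void conjoint_model(char* to, const char* from, size_t n) {  // n <= 96 bytes
  if (n <= 32) {
    std::memcpy(to, from, n);                      // one masked move
  } else if (n <= 64) {                            // non-avx3 shape of case B
    std::memcpy(to + n - 32, from + n - 32, 32);   // high half first
    std::memcpy(to, from, n - 32);                 // masked head last
  } else {                                         // 64 < n <= 96
    std::memcpy(to + n - 64, from + n - 64, 64);   // high 64 bytes first
    std::memcpy(to, from, n - 64);                 // masked head last
  }
}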

void MacroAssembler::copy64_masked_avx(Register dst, Register src, XMMRegister xmm,
                                       KRegister mask, Register length, Register index,
                                       Register temp, int shift, int offset,
                                       bool use64byteVector) {
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  if (!use64byteVector) {
    copy32_avx(dst, src, index, xmm, shift, offset);
    subptr(length, 32 >> shift);
    copy32_masked_avx(dst, src, xmm, mask, length, index, temp, shift, offset+32);
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    assert(MaxVectorSize == 64, "vector length != 64");
    negptr(length);
    addq(length, 64);
    mov64(temp, -1);
    shrxq(temp, temp, length);
    kmovql(mask, temp);
    evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_512bit, type[shift]);
    evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_512bit, type[shift]);
  }
}
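In the 64-byte branch, negptr plus addq leaves length holding 64 - n for n remaining elements, so the shrxq of an all-ones temp produces a mask with exactly the low n bits set, which kmovql moves into the k register. The arithmetic, modeled in plain C++:

#include <stdint.h>

// n = remaining elements, 1..64: all ones shifted right by (64 - n)
static inline uint64_t tail_mask_64(unsigned n) {
  return ~0ULL >> (64 - n);   // n == 64 gives a shift of 0, i.e. all ones
}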


void MacroAssembler::copy32_masked_avx(Register dst, Register src, XMMRegister xmm,
                                       KRegister mask, Register length, Register index,
                                       Register temp, int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  BasicType type[] = { T_BYTE, T_SHORT, T_INT, T_LONG };
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  mov64(temp, 1);
  shlxq(temp, temp, length);
  decq(temp);
  kmovql(mask, temp);
  evmovdqu(xmm, mask, Address(src, index, scale, offset), Assembler::AVX_256bit, type[shift]);
  evmovdqu(Address(dst, index, scale, offset), mask, xmm, Assembler::AVX_256bit, type[shift]);
}
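Here the mask is built the other way around: mov64(temp, 1), shlxq, decq computes (1 << n) - 1, again the low n bits, for at most 32 remaining elements. Modeled in plain C++:

#include <stdint.h>

// n = remaining elements, 0..32, so the shift stays well-defined
static inline uint64_t tail_mask_32(unsigned n) {
  return (1ULL << n) - 1;
}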


void MacroAssembler::copy32_avx(Register dst, Register src, Register index, XMMRegister xmm,
                                int shift, int offset) {
  assert(MaxVectorSize >= 32, "vector length should be >= 32");
  Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
  vmovdqu(xmm, Address(src, index, scale, offset));
  vmovdqu(Address(dst, index, scale, offset), xmm);
}


void MacroAssembler::copy64_avx(Register dst, Register src, Register index, XMMRegister xmm,
                                bool conjoint, int shift, int offset, bool use64byteVector) {
  assert(MaxVectorSize == 64 || MaxVectorSize == 32, "vector length mismatch");
  if (!use64byteVector) {
    if (conjoint) {
      copy32_avx(dst, src, index, xmm, shift, offset+32);
      copy32_avx(dst, src, index, xmm, shift, offset);
    } else {
      copy32_avx(dst, src, index, xmm, shift, offset);
      copy32_avx(dst, src, index, xmm, shift, offset+32);
    }
  } else {
    Address::ScaleFactor scale = (Address::ScaleFactor)(shift);
    evmovdquq(xmm, Address(src, index, scale, offset), Assembler::AVX_512bit);
    evmovdquq(Address(dst, index, scale, offset), xmm, Assembler::AVX_512bit);
  }
}
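In the two-move (32-byte vector) path the conjoint flag only flips the order of the halves: the high half is copied first. For a backward overlapping copy (dst above src), the high 32 source bytes must be staged through the register before the store to the low half can clobber them. A scalar model of that ordering (a sketch; tmp plays the role of the ymm register):

#include <cstring>

static void copy64_model(char* dst, const char* src, bool conjoint) {
  char tmp[32];                                                      // models the ymm register
  if (conjoint) {
    std::memcpy(tmp, src + 32, 32); std::memcpy(dst + 32, tmp, 32);  // high half first
    std::memcpy(tmp, src,      32); std::memcpy(dst,      tmp, 32);
  } else {
    std::memcpy(tmp, src,      32); std::memcpy(dst,      tmp, 32);  // low half first
    std::memcpy(tmp, src + 32, 32); std::memcpy(dst + 32, tmp, 32);
  }
}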

#endif
