Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
8266332: Adler32 intrinsic for x86 64-bit platforms
Co-authored-by: Xubo Zhang <xubo.zhang@intel.com>
Co-authored-by: Greg B Tucker <greg.b.tucker@intel.com>
Co-authored-by: Pengfei Li <pli@openjdk.org>
Reviewed-by: sviswanathan, jbhateja, kvn, neliasso
  • Loading branch information
3 people authored and Vladimir Kozlov committed May 19, 2021
1 parent b961f25 commit 8e3549f
Show file tree
Hide file tree
Showing 13 changed files with 394 additions and 6 deletions.
12 changes: 12 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Expand Up @@ -8030,6 +8030,18 @@ void Assembler::vbroadcastsd(XMMRegister dst, Address src, int vector_len) {
emit_operand(dst, src);
}

// Emits VBROADCASTF128 (VEX.256.66.0F38.W0 1A /r) with a memory source.
// Only the 256-bit form is accepted; there is no register-source overload here.
void Assembler::vbroadcastf128(XMMRegister dst, Address src, int vector_len) {
  assert(VM_Version::supports_avx(), "");
  assert(vector_len == AVX_256bit, "");
  assert(dst != xnoreg, "sanity");
  InstructionMark mark(this);
  InstructionAttr attr(vector_len, /* vex_w */ false, /* legacy_mode */ false, /* no_mask_reg */ true, /* uses_vl */ true);
  attr.set_address_attributes(/* tuple_type */ EVEX_T4, /* input_size_in_bits */ EVEX_32bit);
  // For the prefix the operands are swapped: the address comes first and dst
  // is passed as a plain register encoding.
  const int dst_enc = dst->encoding();
  vex_prefix(src, 0, dst_enc, VEX_SIMD_66, VEX_OPCODE_0F_38, &attr);
  emit_int8(0x1A); // opcode byte
  emit_operand(dst, src);
}

// gpr source broadcast forms

Expand Down
3 changes: 2 additions & 1 deletion src/hotspot/cpu/x86/assembler_x86.hpp
Expand Up @@ -2442,11 +2442,12 @@ class Assembler : public AbstractAssembler {
void evbroadcasti64x2(XMMRegister dst, XMMRegister src, int vector_len);
void evbroadcasti64x2(XMMRegister dst, Address src, int vector_len);

// scalar single/double precision replicate
// scalar single/double precision and 128-bit memory replicate
void vbroadcastss(XMMRegister dst, XMMRegister src, int vector_len);
void vbroadcastss(XMMRegister dst, Address src, int vector_len);
void vbroadcastsd(XMMRegister dst, XMMRegister src, int vector_len);
void vbroadcastsd(XMMRegister dst, Address src, int vector_len);
// 128-bit memory broadcast (opcode 0F38 1A); the implementation asserts vector_len == AVX_256bit
void vbroadcastf128(XMMRegister dst, Address src, int vector_len);

// gpr sourced byte/word/dword/qword replicate
void evpbroadcastb(XMMRegister dst, Register src, int vector_len);
Expand Down
10 changes: 10 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.cpp
Expand Up @@ -3231,6 +3231,16 @@ void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int
Assembler::vpmullw(dst, nds, src, vector_len);
}

// vpmulld with an AddressLiteral source operand.
// If the literal cannot be addressed directly, its address is first
// materialized in scratch_reg and the multiply reads through that register.
void MacroAssembler::vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) {
  assert((UseAVX > 0), "AVX support is needed");
  if (!reachable(src)) {
    // Out of reach: go through the scratch register.
    lea(scratch_reg, src);
    Assembler::vpmulld(dst, nds, Address(scratch_reg, 0), vector_len);
    return;
  }
  // Reachable: scratch_reg is left untouched.
  Assembler::vpmulld(dst, nds, as_Address(src), vector_len);
}

void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15");
Assembler::vpsubb(dst, nds, src, vector_len);
Expand Down
8 changes: 8 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.hpp
Expand Up @@ -1307,6 +1307,13 @@ class MacroAssembler: public Assembler {

void vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
// vpmulld overloads.
// The Address and XMMRegister forms simply forward to Assembler; they are
// re-declared here because declaring the AddressLiteral overload in this
// class would otherwise hide the inherited Assembler::vpmulld overloads.
void vpmulld(XMMRegister dst, XMMRegister nds, Address src, int vector_len) {
  Assembler::vpmulld(dst, nds, src, vector_len);
}
void vpmulld(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  Assembler::vpmulld(dst, nds, src, vector_len);
}
// AddressLiteral form: scratch_reg is used to hold the literal's address
// when it is not directly reachable.
void vpmulld(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg);

void vpsubb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len);
void vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len);
Expand Down Expand Up @@ -1764,6 +1771,7 @@ class MacroAssembler: public Assembler {
void kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup);
// Adler-32 update kernel (macroAssembler_x86_adler.cpp); the generated code leaves the checksum in rax
void updateBytesAdler32(Register adler32, Register buf, Register length, XMMRegister shuf0, XMMRegister shuf1, ExternalAddress scale);
#endif // _LP64

// CRC32C code for java.util.zip.CRC32C::updateBytes() intrinsic
Expand Down
211 changes: 211 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86_adler.cpp
@@ -0,0 +1,211 @@
/*
* Copyright (c) 2021, Intel Corporation.
*
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*
*/

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "runtime/stubRoutines.hpp"
#include "macroAssembler_x86.hpp"

#ifdef _LP64
// Emits the Adler-32 update kernel.
//
//   init_d    - incoming checksum: low 16 bits seed the byte sum "a",
//               high 16 bits seed the sum-of-sums "b" (register is modified)
//   data      - address of the first byte (advanced by the generated code)
//   size      - number of bytes, 32-bit (modified)
//   yshuf0    - vpshufb control used for the first 8 bytes of each 16-byte chunk
//   yshuf1    - vpshufb control used for the last 8 bytes of each 16-byte chunk
//   ascaletab - per-lane multipliers applied to ya during the reduction
//
// Result: (b << 16) | a in rax.
// Uses r8, r11, rax, rcx, rdx and xmm0-xmm5 as temporaries; r12, r13 and r14
// are pushed on entry and popped on exit.
//
// Structure of the generated code:
//   - size < 32: purely scalar byte loop (LT32 -> FINAL_LOOP).
//   - otherwise: blocks of at most LIMIT bytes are processed 16 bytes at a
//     time with 256-bit vectors, a and b are reduced modulo BASE after each
//     block, and a trailing 1..15 bytes are handled by the scalar loop.
void MacroAssembler::updateBytesAdler32(Register init_d, Register data, Register size, XMMRegister yshuf0, XMMRegister yshuf1, ExternalAddress ascaletab)
{
  const int LIMIT = 5552;                  // max bytes consumed between two modulo reductions
  const int BASE = 65521;                  // Adler-32 modulus
  const int CHUNKSIZE = 16;                // bytes consumed per vector loop iteration
  const int CHUNKSIZE_M1 = CHUNKSIZE - 1;

  const Register s = r11;                  // bytes to process in the current block
  const Register a_d = r12; //r12d
  const Register b_d = r8; //r8d
  const Register end = r13;                // loop bound / end-of-block pointer

  // ya/xa, yb/xb and ysa/xsa name the same registers (256-bit vs. 128-bit use).
  const XMMRegister ya = xmm0;             // per-lane byte sums
  const XMMRegister yb = xmm1;             // per-lane running sums of ya
  const XMMRegister ydata0 = xmm2;
  const XMMRegister ydata1 = xmm3;
  const XMMRegister ysa = xmm4;            // ya scaled by ascaletab (reduction only)
  const XMMRegister ydata = ysa;           // shares xmm4: loaded chunk (vector loop only)
  const XMMRegister xa = xmm0;
  const XMMRegister xb = xmm1;
  const XMMRegister xtmp0 = xmm2;
  const XMMRegister xtmp1 = xmm3;
  const XMMRegister xsa = xmm4;
  const XMMRegister xtmp2 = xmm5;
  assert_different_registers(init_d, data, size, s, a_d, b_d, end, rax);

  Label SLOOP1, SLOOP1A, SKIP_LOOP_1A, FINISH, LT32, DO_FINAL, FINAL_LOOP, ZERO_SIZE, END;

  push(r12);
  push(r13);
  push(r14);
  movl(b_d, init_d); //adler
  shrl(b_d, 16);                           // b = adler >> 16
  andl(init_d, 0xFFFF);                    // a = adler & 0xFFFF
  cmpl(size, 32);
  jcc(Assembler::below, LT32);             // short input: scalar path only
  movdl(xa, init_d); //vmovd - 32bit
  vpxor(yb, yb, yb, Assembler::AVX_256bit);

  // Outer loop: one iteration per block of at most LIMIT bytes.
  bind(SLOOP1);
  movl(s, LIMIT);
  cmpl(s, size);
  cmovl(Assembler::above, s, size); // s = min(size, LIMIT)
  lea(end, Address(s, data, Address::times_1, -CHUNKSIZE_M1)); // end = data + s - 15
  cmpptr(data, end);
  jcc(Assembler::aboveEqual, SKIP_LOOP_1A); // fewer than 16 bytes in this block

  // Inner loop: 16 bytes per iteration, handled as two groups of 8 bytes
  // (one byte per 32-bit lane).
  align(32);
  bind(SLOOP1A);
  vbroadcastf128(ydata, Address(data, 0), Assembler::AVX_256bit);
  addptr(data, CHUNKSIZE);
  vpshufb(ydata0, ydata, yshuf0, Assembler::AVX_256bit);
  vpaddd(ya, ya, ydata0, Assembler::AVX_256bit);
  vpaddd(yb, yb, ya, Assembler::AVX_256bit);
  vpshufb(ydata1, ydata, yshuf1, Assembler::AVX_256bit);
  vpaddd(ya, ya, ydata1, Assembler::AVX_256bit);
  vpaddd(yb, yb, ya, Assembler::AVX_256bit);
  cmpptr(data, end);
  jcc(Assembler::below, SLOOP1A);

  bind(SKIP_LOOP_1A);
  addptr(end, CHUNKSIZE_M1);               // end = first byte past this block
  testl(s, CHUNKSIZE_M1);
  jcc(Assembler::notEqual, DO_FINAL);      // s not a multiple of 16: scalar tail remains

  // either we're done, or we just did LIMIT
  subl(size, s);

  // reduce
  // With one byte per lane, lane j of yb under-represents nothing but counts
  // every byte 8 times per step once scaled by 8, whereas the scalar
  // recurrence counts the byte in lane j only (8 - j) times in the step it
  // arrives. Hence: b += 8 * hsum(yb) - hsum(j * ya[j]).
  vpslld(yb, yb, 3, Assembler::AVX_256bit); //b is scaled by 8
  vpmulld(ysa, ya, ascaletab, Assembler::AVX_256bit, r14); // r14 is only a scratch register here

  // compute horizontal sums of ya, yb, ysa
  vextracti128(xtmp0, ya, 1);
  vextracti128(xtmp1, yb, 1);
  vextracti128(xtmp2, ysa, 1);
  vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
  vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
  vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);

  // a = hsum(ya) % BASE
  movdl(rax, xa);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx
  movl(a_d, rdx);

  // b = (b + 8 * hsum(yb) - hsum(ysa)) % BASE
  vpsubd(xb, xb, xsa, Assembler::AVX_128bit);
  movdl(rax, xb);
  addl(rax, b_d);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx
  movl(b_d, rdx);

  testl(size, size);
  jcc(Assembler::zero, FINISH);

  // continue loop
  movdl(xa, a_d);                          // re-seed the vector accumulators with the reduced a
  vpxor(yb, yb, yb, Assembler::AVX_256bit);
  jmp(SLOOP1);

  // a and b are already reduced: pack the result
  bind(FINISH);
  movl(rax, b_d);
  shll(rax, 16);
  orl(rax, a_d);
  jmp(END);

  // size < 32: no vector work, go straight to the scalar loop (or to the
  // final reduction when size == 0)
  bind(LT32);
  movl(a_d, init_d);
  lea(end, Address(data, size, Address::times_1));
  testl(size, size);
  jcc(Assembler::notZero, FINAL_LOOP);
  jmp(ZERO_SIZE);

  // handle remaining 1...15 bytes
  bind(DO_FINAL);
  // reduce
  // Same lane reduction as above, but a and b are left unreduced here; the
  // modulo is applied once after the scalar loop (ZERO_SIZE).
  vpslld(yb, yb, 3, Assembler::AVX_256bit); //b is scaled by 8
  vpmulld(ysa, ya, ascaletab, Assembler::AVX_256bit, r14); //scaled a

  vextracti128(xtmp0, ya, 1);
  vextracti128(xtmp1, yb, 1);
  vextracti128(xtmp2, ysa, 1);
  vpaddd(xa, xa, xtmp0, Assembler::AVX_128bit);
  vpaddd(xb, xb, xtmp1, Assembler::AVX_128bit);
  vpaddd(xsa, xsa, xtmp2, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  vphaddd(xa, xa, xa, Assembler::AVX_128bit);
  vphaddd(xb, xb, xb, Assembler::AVX_128bit);
  vphaddd(xsa, xsa, xsa, Assembler::AVX_128bit);
  vpsubd(xb, xb, xsa, Assembler::AVX_128bit);

  movdl(a_d, xa);
  movdl(rax, xb);
  addl(b_d, rax);

  // Scalar loop: a += *data++; b += a; until data reaches end
  align(32);
  bind(FINAL_LOOP);
  movzbl(rax, Address(data, 0)); //movzx eax, byte[data]
  addl(a_d, rax);
  addptr(data, 1);
  addl(b_d, a_d);
  cmpptr(data, end);
  jcc(Assembler::below, FINAL_LOOP);

  // Final reduction: rax = ((b % BASE) << 16) | (a % BASE)
  bind(ZERO_SIZE);

  movl(rax, a_d);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // div ecx -- divide edx:eax by ecx, quot->eax, rem->edx
  movl(a_d, rdx);

  movl(rax, b_d);
  xorl(rdx, rdx);
  movl(rcx, BASE);
  divl(rcx); // divide edx:eax by ecx, quot->eax, rem->edx
  shll(rdx, 16);
  orl(rdx, a_d);
  movl(rax, rdx);

  bind(END);
  pop(r14);
  pop(r13);
  pop(r12);
}
#endif
46 changes: 46 additions & 0 deletions src/hotspot/cpu/x86/stubGenerator_x86_64.cpp
Expand Up @@ -5790,6 +5790,47 @@ address generate_avx_ghash_processBlocks() {
return start;
}


/***
 *  Generates the updateBytesAdler32 stub.
 *
 *  Arguments:
 *
 *  Inputs:
 *    c_rarg0   - int   adler
 *    c_rarg1   - byte* buff
 *    c_rarg2   - int   len
 *
 *  Output:
 *    rax       - int adler result
 *
 *  Returns the entry address of the generated stub.
 */

address generate_updateBytesAdler32() {
  assert(UseAdler32Intrinsics, "need AVX2");

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");

  address start = __ pc();

  const Register data = r9;
  const Register size = r10;

  const XMMRegister yshuf0 = xmm6;
  const XMMRegister yshuf1 = xmm7;
  assert_different_registers(c_rarg0, c_rarg1, c_rarg2, data, size);

  BLOCK_COMMENT("Entry:");
  __ enter(); // required for proper stackwalking of RuntimeStub frame

  // r9 is passed as the scratch register for the two table loads; it only
  // takes on its role as 'data' afterwards.
  __ vmovdqu(yshuf0, ExternalAddress((address) StubRoutines::x86::_adler32_shuf0_table), r9);
  __ vmovdqu(yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_shuf1_table), r9);
  __ movptr(data, c_rarg1); //data
  __ movl(size, c_rarg2); //length
  __ updateBytesAdler32(c_rarg0, data, size, yshuf0, yshuf1, ExternalAddress((address) StubRoutines::x86::_adler32_ascale_table));
  // The stub works on 256-bit vector registers; clear their upper halves
  // before returning so that later non-VEX SSE code does not pay the
  // AVX/SSE transition penalty. The result lives in rax and is unaffected.
  __ vzeroupper();
  __ leave();
  __ ret(0);
  return start;
}

/**
* Arguments:
*
Expand Down Expand Up @@ -6754,6 +6795,11 @@ address generate_avx_ghash_processBlocks() {
StubRoutines::_crc32c_table_addr = (address)StubRoutines::x86::_crc32c_table;
StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(supports_clmul);
}

if (UseAdler32Intrinsics) {
StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
}

if (UseLibmIntrinsic && InlineIntrinsics) {
if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin) ||
vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos) ||
Expand Down
19 changes: 19 additions & 0 deletions src/hotspot/cpu/x86/stubRoutines_x86.cpp
Expand Up @@ -224,6 +224,25 @@ juint StubRoutines::x86::_shuf_table_crc32_avx512[] =
0x83828100UL, 0x87868584UL, 0x8b8a8988UL, 0x8f8e8d8cUL,
0x03020100UL, 0x07060504UL, 0x0b0a0908UL, 0x000e0d0cUL
};

// Per-lane multipliers 0..7 (eight 32-bit lanes); the Adler-32 stub code
// multiplies its per-lane byte sums by these values during the reduction.
juint StubRoutines::x86::_adler32_ascale_table[] =
{
  0x00000000UL, // lane 0
  0x00000001UL, // lane 1
  0x00000002UL, // lane 2
  0x00000003UL, // lane 3
  0x00000004UL, // lane 4
  0x00000005UL, // lane 5
  0x00000006UL, // lane 6
  0x00000007UL  // lane 7
};

// vpshufb control for source bytes 0..7: entry 0xFFFFFFnn places source
// byte nn in the low byte of its 32-bit lane; the 0xFF control bytes zero
// the remaining three bytes of the lane.
juint StubRoutines::x86::_adler32_shuf0_table[] =
{
  0xFFFFFF00UL, // lane 0 <- byte 0
  0xFFFFFF01UL, // lane 1 <- byte 1
  0xFFFFFF02UL, // lane 2 <- byte 2
  0xFFFFFF03UL, // lane 3 <- byte 3
  0xFFFFFF04UL, // lane 4 <- byte 4
  0xFFFFFF05UL, // lane 5 <- byte 5
  0xFFFFFF06UL, // lane 6 <- byte 6
  0xFFFFFF07UL  // lane 7 <- byte 7
};

// vpshufb control for source bytes 8..15: entry 0xFFFFFFnn places source
// byte nn in the low byte of its 32-bit lane; the 0xFF control bytes zero
// the remaining three bytes of the lane.
juint StubRoutines::x86::_adler32_shuf1_table[] =
{
  0xFFFFFF08UL, 0xFFFFFF09UL, 0xFFFFFF0AUL, 0xFFFFFF0BUL,
  0xFFFFFF0CUL, 0xFFFFFF0DUL, 0xFFFFFF0EUL, 0xFFFFFF0FUL
};

#endif // _LP64

#define D 32
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/x86/stubRoutines_x86.hpp
Expand Up @@ -119,6 +119,9 @@ class x86 {
static juint _crc_by128_masks_avx512[];
static juint _crc_table_avx512[];
static juint _shuf_table_crc32_avx512[];
// Constant tables read by the updateBytesAdler32 stub
static juint _adler32_shuf0_table[];   // vpshufb control, source bytes 0..7
static juint _adler32_shuf1_table[];   // vpshufb control, source bytes 8..15
static juint _adler32_ascale_table[];  // per-lane multipliers 0..7
#endif // _LP64
// table for CRC32C
static juint* _crc32c_table;
Expand Down

3 comments on commit 8e3549f

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@xbzhang99
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

/backport jdk11u-dev

@openjdk
Copy link

@openjdk openjdk bot commented on 8e3549f Jul 20, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@xbzhang99 only OpenJDK contributors can use the /backport command

Please sign in to comment.