Skip to content
Closed
Changes from 1 commit
Commits
Show all changes
37 commits
Select commit Hold shift + click to select a range
3e2aab6
Add Unsafe.setMemory as intrinsic
asgibbons Mar 27, 2024
2334b03
Added actual intrinsic
asgibbons Mar 27, 2024
6eebcbd
Removed setMemory1; debugged intrinsic code
asgibbons Mar 28, 2024
7c73856
Test removing intrinsic
asgibbons Mar 28, 2024
74c47e2
Add benchmark
asgibbons Mar 28, 2024
6e283bc
Restore intrinsic
asgibbons Mar 28, 2024
44c24ec
Address review comment
asgibbons Mar 29, 2024
b17a1f4
Fixed bug - incorrect interface to *_fill_entry
asgibbons Mar 29, 2024
401a2a9
Clean up code for PR
asgibbons Mar 29, 2024
c5cb30c
Use non-sse fill (old left in)
asgibbons Apr 1, 2024
6ee69c8
Remove dead code
asgibbons Apr 1, 2024
3aa60a4
Addressing review comments.
asgibbons Apr 2, 2024
8bed156
Fix Windows
asgibbons Apr 3, 2024
b025318
Fixed generate_fill when count > 0x80000000
asgibbons Apr 5, 2024
fd6f04f
Oops
asgibbons Apr 6, 2024
f81aaa9
Add movq to locate_operand
asgibbons Apr 8, 2024
b0ac857
Address review comments (#15)
asgibbons Apr 11, 2024
95230e2
Set memory test (#16)
asgibbons Apr 11, 2024
41ffcc3
Merge master
asgibbons Apr 11, 2024
b99499a
Fix whitespace error.
asgibbons Apr 11, 2024
89db3eb
Addressing more review comments
asgibbons Apr 11, 2024
970c575
Addressing yet more review comments
asgibbons Apr 12, 2024
6e731c8
Even more review comments
asgibbons Apr 12, 2024
405e4e0
Change fill routines
asgibbons Apr 15, 2024
95b0a34
Rename UnsafeCopyMemory{,Mark} to UnsafeMemory{Access,Mark} (#19)
asgibbons Apr 15, 2024
44cc91b
Only add a memory mark for byte unaligned fill
asgibbons Apr 15, 2024
824fb60
Set memory test (#21)
asgibbons Apr 15, 2024
80b5a0c
Set memory test (#22)
asgibbons Apr 15, 2024
856464e
Set memory test (#23)
asgibbons Apr 15, 2024
116d7dd
Merge branch 'openjdk:master' into setMemory
asgibbons Apr 15, 2024
113aa90
Fix memory mark after sync to upstream
asgibbons Apr 15, 2024
7a1d67e
Add enter() and leave(); remove Windows-specific register stuff
asgibbons Apr 16, 2024
dccf6b6
Address review comments; update copyright years
asgibbons Apr 19, 2024
dd0094e
Review comments
asgibbons Apr 19, 2024
1961624
Long to short jmp; other cleanup
asgibbons Apr 19, 2024
c129016
Fix UnsafeCopyMemoryMark scope issue
asgibbons Apr 20, 2024
1122b50
Merge branch 'openjdk:master' into setMemory
asgibbons Apr 20, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
282 changes: 134 additions & 148 deletions src/hotspot/cpu/x86/stubGenerator_x86_64_arraycopy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2482,9 +2482,94 @@ address StubGenerator::generate_unsafe_copy(const char *name,
}


// Static enum for helper
enum USM_TYPE {USM_SHORT, USM_DWORD, USM_QUADWORD};
// Helper for generate_unsafe_setmemory
//
// Atomically fill an array of memory using 2-, 4-, or 8-byte chunks
static void do_setmemory_atomic_loop(USM_TYPE type, Register dest,
Register size, Register wide_value,
Register tmp, Label& L_exit,
MacroAssembler *_masm) {
Label L_Loop, L_Tail, L_TailLoop;

int shiftval = 0;
int incr = 0;

switch (type) {
case USM_SHORT:
shiftval = 1;
incr = 16;
break;
case USM_DWORD:
shiftval = 2;
incr = 32;
break;
case USM_QUADWORD:
shiftval = 3;
incr = 64;
break;
}

// At this point, we know the lower bits of size are zero
__ shrq(size, shiftval);
// size now has number of X-byte chunks (2, 4 or 8)
__ cmpq(size, 8);
__ jccb(Assembler::below, L_Tail);

// Number of (8*X)-byte chunks into rScratch1
__ movq(tmp, size);
__ shrq(tmp, 3);
Copy link
Member

@JornVernee JornVernee Apr 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

shr sets the zero flag, so I think you can just move the jump to after the shift and avoid a separate comparison

Suggested change
__ cmpq(size, 8);
__ jccb(Assembler::below, L_Tail);
// Number of (8*X)-byte chunks into rScratch1
__ movq(tmp, size);
__ shrq(tmp, 3);
// Number of (8*X)-byte chunks into rScratch1
__ movq(tmp, size);
__ shrq(tmp, 3);
__ jccb(Assembler::zero, L_Tail);

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good catch. Done.


__ BIND(L_Loop);

// Unroll 8 stores
for (int i = 0; i < 8; i++) {
switch (type) {
case USM_SHORT:
__ movw(Address(dest, (2 * i)), wide_value);
Copy link
Member

@jatin-bhateja jatin-bhateja Apr 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

MOVW emits an extra Operand Size Override prefix byte compared to 32 and 64 bit stores, any specific reason for keeping same unroll factor for all the stores.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding is the spec requires the appropriate-sized write based on alignment and size. This is why there's no 128-bit or 256-bit store loops.

break;
case USM_DWORD:
__ movl(Address(dest, (4 * i)), wide_value);
break;
case USM_QUADWORD:
__ movq(Address(dest, (8 * i)), wide_value);
break;
}
}
Comment on lines +2527 to +2539
Copy link
Member

@jatin-bhateja jatin-bhateja Apr 20, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I understand we want to be as accurate as possible in filling the tail in an event of SIGBUS, but we are anyways creating a wide value for 8 packed bytes if destination segment was quadword aligned, aligned quadword stores are implicitly atomic on x86 targets, what's your thoughts on using a vector instruction based loop.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe the spec is specific on the size of the store required given alignment and size. I want to honor that spec even though wider stores could be done in many cases.

__ addq(dest, incr);
__ decrementq(tmp);
__ jccb(Assembler::notZero, L_Loop);

__ BIND(L_Tail);

// Find number of remaining X-byte chunks
__ andq(size, 0x7);

// If zero, then we're done
__ jccb(Assembler::zero, L_exit);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Code in generate_unsafe_setmemory() uses long jumps to L_exit but here you use short. Why?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah - the original code (3 iterations ago) was about 10 bytes too long for a short jump. It's short enough now. Changed.


__ BIND(L_TailLoop);

switch (type) {
case USM_SHORT:
__ movw(Address(dest, 0), wide_value);
break;
case USM_DWORD:
__ movl(Address(dest, 0), wide_value);
break;
case USM_QUADWORD:
__ movq(Address(dest, 0), wide_value);
break;
}
__ addq(dest, incr >> 3);
__ decrementq(size);
__ jccb(Assembler::notZero, L_TailLoop);
}

// Generate 'unsafe' set memory stub
// Though just as safe as the other stubs, it takes an unscaled
// size_t argument instead of an element count.
// size_t (# bytes) argument instead of an element count.
//
// Input:
// c_rarg0 - destination array address
Expand All @@ -2494,7 +2579,8 @@ address StubGenerator::generate_unsafe_copy(const char *name,
// Examines the alignment of the operands and dispatches
// to an int, short, or byte fill loop.
//
address StubGenerator::generate_unsafe_setmemory(const char *name, address unsafe_byte_fill) {
address StubGenerator::generate_unsafe_setmemory(const char *name,
address unsafe_byte_fill) {
__ align(CodeEntryAlignment);
StubCodeMark mark(this, "StubRoutines", name);
address start = __ pc();
Expand All @@ -2517,8 +2603,6 @@ address StubGenerator::generate_unsafe_setmemory(const char *name, address unsaf
const Register size = rsi;
const Register wide_value = rax;
const Register rScratch1 = rcx;
const Register rScratch3 = r8;
const Register rScratch4 = r11;

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe put an assert_different_registers here for the above registers, just to be sure. (I see you are avoiding the existing rscratch1 already, because of a conflict with c_rarg2)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Done.

// fill_to_memory_atomic(unsigned char*, unsigned long, unsigned char)

Expand All @@ -2528,167 +2612,69 @@ address StubGenerator::generate_unsafe_setmemory(const char *name, address unsaf
{
const Register byteVal = rdx;

// Propagate byte to full Register
__ movzbl(rScratch1, byteVal);
__ mov64(wide_value, 0x0101010101010101);
__ imulq(wide_value, rScratch1);

// Check for pointer & size alignment
__ movq(rScratch1, dest);
__ orq(rScratch1, size);

// Propagate byte to full Register
__ movzbl(rScratch3, byteVal);
__ mov64(wide_value, 0x0101010101010101);
__ imulq(wide_value, rScratch3);
}

{
const Register rScratch2 = rdx;

__ testb(rScratch1, 7);
__ jcc(Assembler::equal, L_fillQuadwords);

__ testb(rScratch1, 3);
__ jcc(Assembler::equal, L_fillDwords);

__ testb(rScratch1, 1);
__ jcc(Assembler::notEqual, L_fillBytes);

// Fill words
{
Label L_wordsTail, L_wordsLoop, L_wordsTailLoop;
UnsafeSetMemoryMark usmm(this, true, true);
////// Set words
__ leaq(rScratch2, Address(size, 1));
__ movq(rScratch1, rScratch2);
__ shrq(rScratch1, 4);
__ cmpq(rScratch2, 16);
__ jccb(Assembler::below, L_wordsTail);
__ leaq(rScratch3, Address(dest, 14));
__ movq(rScratch4, rScratch1);

__ BIND(L_wordsLoop);

// Unroll 8 word stores
for (int i = 7; i >= 0; i--) {
__ movw(Address(rScratch3, -(2 * i)), wide_value);
}

__ addq(rScratch3, 16);
__ decrementq(rScratch4);
__ jccb(Assembler::notEqual, L_wordsLoop);
__ testb(rScratch1, 7);
__ jcc(Assembler::equal, L_fillQuadwords);

__ BIND(L_wordsTail);
__ testb(rScratch1, 3);
__ jcc(Assembler::equal, L_fillDwords);

// Handle leftovers
__ shlq(rScratch1, 3);
__ shrq(rScratch2, 1);
__ cmpq(rScratch1, rScratch2);
__ jccb(Assembler::aboveEqual, L_exit);
__ decrementq(size);
__ shrq(size, 1);
__ incrementq(size);
__ testb(rScratch1, 1);
__ jcc(Assembler::notEqual, L_fillBytes);

__ BIND(L_wordsTailLoop);

__ movw(Address(dest, rScratch1, Address::times_2), wide_value);
__ incrementq(rScratch1);
__ cmpq(size, rScratch1);
__ jccb(Assembler::notEqual, L_wordsTailLoop);
}
__ jmp(L_exit);

__ BIND(L_fillQuadwords);

// Fill QUADWORDs
{
Label L_qwordLoop, L_qwordsTail, L_qwordsTailLoop;
UnsafeSetMemoryMark usmm(this, true, true);

__ leaq(rScratch2, Address(size, 7));
__ movq(rScratch1, rScratch2);
__ shrq(rScratch1, 6);
__ cmpq(rScratch2, 64);
__ jccb(Assembler::below, L_qwordsTail);
__ leaq(rScratch3, Address(dest, 56));
__ movq(rScratch4, rScratch1);

__ BIND(L_qwordLoop);
// Fill words
{
Label L_wordsTail, L_wordsLoop, L_wordsTailLoop;
UnsafeSetMemoryMark usmm(this, true, true);

// Unroll 8 qword stores
for (int i = 7; i >= 0; i--) {
__ movq(Address(rScratch3, -(8 * i)), wide_value);
}
__ addq(rScratch3, 64);
__ decrementq(rScratch4);
__ jccb(Assembler::notZero, L_qwordLoop);

__ BIND(L_qwordsTail);

// Handle leftovers
__ shlq(rScratch1, 3);
__ shrq(rScratch2, 3);
__ cmpq(rScratch1, rScratch2);
__ jccb(Assembler::aboveEqual, L_exit);
__ decrementq(size);
__ shrq(size, 3);
__ incrementq(size);

__ BIND(L_qwordsTailLoop);

__ movq(Address(dest, rScratch1, Address::times_8), wide_value);
__ incrementq(rScratch1);
__ cmpq(size, rScratch1);
__ jccb(Assembler::notEqual, L_qwordsTailLoop);
}
__ BIND(L_exit);
// At this point, we know the lower bit of size is zero and a
// multiple of 2
do_setmemory_atomic_loop(USM_SHORT, dest, size, wide_value, rScratch1,
L_exit, _masm);
}
__ jmp(L_exit);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Here is long jump to L_exit after do_setmemory_atomic_loop() call. Should this be also short jump?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we have additional code in debug VM wihch increase distance and requires long jump? I don't see it. Usually it something which call __ STOP().

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The old code required a long jump due to the size of do_setmemory_atomic_loop but has since been refactored. The jmp(Label) code will generate a short jump provided the label has been defined and is in range. Otherwise a long jump is generated.

Changed to jmpb


restore_arg_regs();
__ ret(0);
__ BIND(L_fillQuadwords);

__ BIND(L_fillDwords);
// Fill QUADWORDs
{
Label L_qwordLoop, L_qwordsTail, L_qwordsTailLoop;
UnsafeSetMemoryMark usmm(this, true, true);

// Fill DWORDs
{
Label L_dwordLoop, L_dwordsTail, L_dwordsTailLoop;
UnsafeSetMemoryMark usmm(this, true, true);
// At this point, we know the lower 3 bits of size are zero and a
// multiple of 8
do_setmemory_atomic_loop(USM_QUADWORD, dest, size, wide_value, rScratch1,
L_exit, _masm);
}
__ BIND(L_exit);

__ leaq(rScratch2, Address(size, 3));
__ movq(rScratch1, rScratch2);
__ shrq(rScratch1, 5);
__ cmpq(rScratch2, 32);
__ jccb(Assembler::below, L_dwordsTail);
__ leaq(rScratch3, Address(dest, 28));
__ movq(rScratch4, rScratch1);
restore_arg_regs();
__ ret(0);

__ BIND(L_dwordLoop);
__ BIND(L_fillDwords);

// Unroll 8 dword stores
for (int i = 7; i >= 0; i--) {
__ movl(Address(rScratch3, -(i * 4)), wide_value);
}
__ addq(rScratch3, 32);
__ decrementq(rScratch4);
__ jccb(Assembler::notZero, L_dwordLoop);

__ BIND(L_dwordsTail);

// Handle leftovers
__ shlq(rScratch1, 3);
__ shrq(rScratch2, 2);
__ cmpq(rScratch1, rScratch2);
__ jccb(Assembler::aboveEqual, L_exit);
__ decrementq(size);
__ shrq(size, 2);
__ incrementq(size);

__ BIND(L_dwordsTailLoop);

__ movl(Address(dest, rScratch1, Address::times_4), wide_value);
__ incrementq(rScratch1);
__ cmpq(size, rScratch1);
__ jccb(Assembler::notEqual, L_dwordsTailLoop);
}
__ jmp(L_exit);
// Fill DWORDs
{
Label L_dwordLoop, L_dwordsTail, L_dwordsTailLoop;
UnsafeSetMemoryMark usmm(this, true, true);

__ BIND(L_fillBytes);
// At this point, we know the lower 2 bits of size are zero and a
// multiple of 4
do_setmemory_atomic_loop(USM_DWORD, dest, size, wide_value, rScratch1,
L_exit, _masm);
}
__ jmp(L_exit);

__ BIND(L_fillBytes);
#ifdef MUSL_LIBC
{
Label L_byteLoop, L_longByteLoop, L_byteTail, L_byteTailLoop;
Expand Down