Commit e836396

eastig authored and simonis committed
8257436: [aarch64] Regressions in ArrayCopyUnalignedDst.testByte/testChar for 65-78 bytes when UseSIMDForMemoryOps is on
Reviewed-by: simonis
1 parent b170c83 commit e836396

1 file changed: 23 additions, 2 deletions

src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp

Lines changed: 23 additions & 2 deletions
@@ -1094,10 +1094,10 @@ class StubGenerator: public StubCodeGenerator {
                      Register count, Register tmp, int step) {
     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
     bool is_backwards = step < 0;
-    int granularity = uabs(step);
+    unsigned int granularity = uabs(step);
     const Register t0 = r3, t1 = r4;
 
-    // <= 96 bytes do inline. Direction doesn't matter because we always
+    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
     // load all the data before writing anything
     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
@@ -1154,7 +1154,28 @@ class StubGenerator: public StubCodeGenerator {
     if (UseSIMDForMemoryOps) {
       __ ldpq(v0, v1, Address(s, 0));
       __ ldpq(v2, v3, Address(s, 32));
+      // Unaligned pointers can be an issue for copying.
+      // The issue has more chances to happen when granularity of data is
+      // less than 4(sizeof(jint)). Pointers for arrays of jint are at least
+      // 4 byte aligned. Pointers for arrays of jlong are 8 byte aligned.
+      // The most performance drop has been seen for the range 65-80 bytes.
+      // For such cases using the pair of ldp/stp instead of the third pair of
+      // ldpq/stpq fixes the performance issue.
+      if (granularity < sizeof (jint)) {
+        Label copy96;
+        __ cmp(count, u1(80/granularity));
+        __ br(Assembler::HI, copy96);
+        __ ldp(t0, t1, Address(send, -16));
+
+        __ stpq(v0, v1, Address(d, 0));
+        __ stpq(v2, v3, Address(d, 32));
+        __ stp(t0, t1, Address(dend, -16));
+        __ b(finish);
+
+        __ bind(copy96);
+      }
       __ ldpq(v4, v5, Address(send, -32));
+
       __ stpq(v0, v1, Address(d, 0));
       __ stpq(v2, v3, Address(d, 32));
       __ stpq(v4, v5, Address(dend, -32));

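For readers less familiar with the stub generator's assembler DSL, the following is a minimal, illustrative C++ sketch of the strategy the new branch implements for 65-80 byte copies: two wide 32-byte copies for the head, then a single 16-byte copy anchored at the end of the buffers instead of a third 32-byte pair. The function name and the memcpy stand-ins for the ldpq/stpq and ldp/stp register pairs are illustrative assumptions, not code from the JDK sources, and the sketch assumes disjoint source and destination.

// Illustrative sketch only (not JDK code). Models the 65-80 byte fast path
// added by this commit; memcpy stands in for the ldpq/stpq and ldp/stp pairs.
#include <cstddef>
#include <cstring>

static void copy_65_to_80_bytes(unsigned char* d, const unsigned char* s, size_t len) {
  // Assumes 65 <= len <= 80, the range for which the stub takes this branch.
  std::memcpy(d,      s,      32);              // first  ldpq/stpq pair (v0, v1)
  std::memcpy(d + 32, s + 32, 32);              // second ldpq/stpq pair (v2, v3)
  // Tail: one 16-byte copy taken from the end of the buffers rather than a
  // third 32-byte pair. For len < 80 it overlaps bytes already written above;
  // rewriting them with identical data is harmless. The commit reports that
  // using the narrower ldp/stp pair here, instead of a third ldpq/stpq, is
  // what fixes the regression seen for 65-80 byte copies with unaligned
  // destinations.
  std::memcpy(d + len - 16, s + len - 16, 16);  // ldp/stp pair (t0, t1)
}

In the stub itself the tail ldp is issued before any of the stores, in keeping with the comment that the inline path loads all the data before writing anything, so the same trick remains safe for the conjoint copy variants.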