@@ -1094,10 +1094,10 @@ class StubGenerator: public StubCodeGenerator {
                    Register count, Register tmp, int step) {
     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
     bool is_backwards = step < 0;
-    int granularity = uabs(step);
+    unsigned int granularity = uabs(step);
     const Register t0 = r3, t1 = r4;
 
-    // <= 96 bytes do inline. Direction doesn't matter because we always
+    // <= 80 (or 96 for SIMD) bytes do inline. Direction doesn't matter because we always
     // load all the data before writing anything
     Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
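For context (not stated in the hunk itself): step appears to be the signed per-element size in bytes and count the number of elements, so granularity = uabs(step) lets byte thresholds be expressed as element counts. A minimal standalone sketch of that arithmetic, using std::abs in place of uabs and values chosen only for illustration:

  #include <cstdlib>

  int main() {
    int step_bytes  = -1;                    // e.g. a jbyte copy, backwards
    int step_shorts = 2;                     // e.g. a jshort copy, forwards
    unsigned int g1 = std::abs(step_bytes);  // granularity 1
    unsigned int g2 = std::abs(step_shorts); // granularity 2
    // The 80-byte limit used below as u1(80/granularity), in elements:
    return (80 / g1 == 80 && 80 / g2 == 40) ? 0 : 1;
  }
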
@@ -1154,7 +1154,28 @@ class StubGenerator: public StubCodeGenerator {
     if (UseSIMDForMemoryOps) {
       __ ldpq(v0, v1, Address(s, 0));
       __ ldpq(v2, v3, Address(s, 32));
+      // Unaligned pointers can be an issue for copying.
+      // The issue is more likely to occur when the granularity of the data
+      // is less than 4 (sizeof(jint)). Pointers to arrays of jint are at
+      // least 4 byte aligned; pointers to arrays of jlong are 8 byte aligned.
+      // The largest performance drop has been seen for the 65-80 byte range.
+      // For such cases, using an ldp/stp pair instead of a third ldpq/stpq
+      // pair fixes the performance issue.
+      if (granularity < sizeof(jint)) {
+        Label copy96;
+        __ cmp(count, u1(80/granularity));
+        __ br(Assembler::HI, copy96);
+        __ ldp(t0, t1, Address(send, -16));
+
+        __ stpq(v0, v1, Address(d, 0));
+        __ stpq(v2, v3, Address(d, 32));
+        __ stp(t0, t1, Address(dend, -16));
+        __ b(finish);
+
+        __ bind(copy96);
+      }
       __ ldpq(v4, v5, Address(send, -32));
+
       __ stpq(v0, v1, Address(d, 0));
       __ stpq(v2, v3, Address(d, 32));
       __ stpq(v4, v5, Address(dend, -32));
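
For readers less familiar with the stub generator, here is a rough standalone C++ sketch (not part of the patch) of the control flow this SIMD branch emits, assuming it is reached for copies of more than 64 and at most 96 bytes as the updated comment suggests; memcpy of a head and tail buffer stands in for the overlapping ldpq/stpq and ldp/stp pairs, and all names are invented for illustration:

  #include <cstddef>
  #include <cstdint>
  #include <cstring>

  // Sketch only: 'bytes' plays the role of count * granularity, and the
  // s/d/send/dend operands become plain pointer arithmetic.
  static void copy80_simd_sketch(uint8_t* d, const uint8_t* s,
                                 size_t bytes, size_t granularity) {
    uint8_t head[64], tail[32];
    std::memcpy(head, s, 64);                 // ldpq v0,v1 / ldpq v2,v3
    if (granularity < sizeof(int32_t) && bytes <= 80) {
      std::memcpy(tail, s + bytes - 16, 16);  // ldp  t0, t1  (send - 16)
      std::memcpy(d, head, 64);               // stpq v0,v1 / stpq v2,v3
      std::memcpy(d + bytes - 16, tail, 16);  // stp  t0, t1  (dend - 16)
      return;                                 // b(finish)
    }
    std::memcpy(tail, s + bytes - 32, 32);    // ldpq v4,v5   (send - 32)
    std::memcpy(d, head, 64);                 // stpq v0,v1 / stpq v2,v3
    std::memcpy(d + bytes - 32, tail, 32);    // stpq v4,v5   (dend - 32)
  }

The point of the granularity < sizeof(jint) special case is visible here: for 65-80 byte copies the tail is covered by a single 16-byte ldp/stp instead of a third 32-byte ldpq/stpq, and all loads still happen before any store, so overlapping source and destination remain safe.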