Skip to content
This repository has been archived by the owner on May 16, 2023. It is now read-only.

Commit

Permalink
Browse files Browse the repository at this point in the history
8296602: RISC-V: improve performance of copy_memory stub
Backport-of: bd57e2138fc980822a149af905e572ab71ccbf11
  • Loading branch information
Vladimir Kempik committed Nov 24, 2022
1 parent 677a7c4 commit 7e340b3
Showing 1 changed file with 69 additions and 29 deletions.
98 changes: 69 additions & 29 deletions src/hotspot/cpu/riscv/stubGenerator_riscv.cpp
Expand Up @@ -869,7 +869,11 @@ class StubGenerator: public StubCodeGenerator {
//
/*
* if (is_aligned) {
* goto copy_8_bytes;
* if (count >= 32)
* goto copy32_loop;
* if (count >= 8)
* goto copy8_loop;
* goto copy_small;
* }
* bool is_backwards = step < 0;
* int granularity = uabs(step);
Expand All @@ -887,9 +891,12 @@ class StubGenerator: public StubCodeGenerator {
*
* if ((dst % 8) == (src % 8)) {
* aligned;
* goto copy8;
* goto copy_big;
* }
*
* copy_big:
* if the amount to copy is more than (or equal to) 32 bytes goto copy32_loop
* else goto copy8_loop
* copy_small:
* load element one by one;
* done;
Expand Down Expand Up @@ -950,10 +957,10 @@ class StubGenerator: public StubCodeGenerator {
bool is_backwards = step < 0;
int granularity = uabs(step);

const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17;
const Register src = x30, dst = x31, cnt = x15, tmp3 = x16, tmp4 = x17, tmp5 = x14, tmp6 = x13;

Label same_aligned;
Label copy8, copy_small, done;
Label copy_big, copy32_loop, copy8_loop, copy_small, done;

copy_insn ld_arr = NULL, st_arr = NULL;
switch (granularity) {
Expand Down Expand Up @@ -988,36 +995,69 @@ class StubGenerator: public StubCodeGenerator {
}

if (is_aligned) {
__ addi(tmp, cnt, -32);
__ bgez(tmp, copy32_loop);
__ addi(tmp, cnt, -8);
__ bgez(tmp, copy8);
__ bgez(tmp, copy8_loop);
__ j(copy_small);
}

__ mv(tmp, 16);
__ blt(cnt, tmp, copy_small);

__ xorr(tmp, src, dst);
__ andi(tmp, tmp, 0b111);
__ bnez(tmp, copy_small);
} else {
__ mv(tmp, 16);
__ blt(cnt, tmp, copy_small);

__ xorr(tmp, src, dst);
__ andi(tmp, tmp, 0b111);
__ bnez(tmp, copy_small);

__ bind(same_aligned);
__ andi(tmp, src, 0b111);
__ beqz(tmp, copy_big);
if (is_backwards) {
__ addi(src, src, step);
__ addi(dst, dst, step);
}
(_masm->*ld_arr)(tmp3, Address(src), t0);
(_masm->*st_arr)(tmp3, Address(dst), t0);
if (!is_backwards) {
__ addi(src, src, step);
__ addi(dst, dst, step);
}
__ addi(cnt, cnt, -granularity);
__ beqz(cnt, done);
__ j(same_aligned);

__ bind(same_aligned);
__ andi(tmp, src, 0b111);
__ beqz(tmp, copy8);
__ bind(copy_big);
__ mv(tmp, 32);
__ blt(cnt, tmp, copy8_loop);
}
__ bind(copy32_loop);
if (is_backwards) {
__ addi(src, src, step);
__ addi(dst, dst, step);
__ addi(src, src, -wordSize * 4);
__ addi(dst, dst, -wordSize * 4);
}
(_masm->*ld_arr)(tmp3, Address(src), t0);
(_masm->*st_arr)(tmp3, Address(dst), t0);
// we first load 32 bytes, then write it, so the direction here doesn't matter
__ ld(tmp3, Address(src));
__ ld(tmp4, Address(src, 8));
__ ld(tmp5, Address(src, 16));
__ ld(tmp6, Address(src, 24));
__ sd(tmp3, Address(dst));
__ sd(tmp4, Address(dst, 8));
__ sd(tmp5, Address(dst, 16));
__ sd(tmp6, Address(dst, 24));

if (!is_backwards) {
__ addi(src, src, step);
__ addi(dst, dst, step);
__ addi(src, src, wordSize * 4);
__ addi(dst, dst, wordSize * 4);
}
__ addi(cnt, cnt, -granularity);
__ beqz(cnt, done);
__ j(same_aligned);
__ addi(tmp, cnt, -(32 + wordSize * 4));
__ addi(cnt, cnt, -wordSize * 4);
__ bgez(tmp, copy32_loop); // cnt >= 32, do next loop

__ beqz(cnt, done); // if that's all - done

__ addi(tmp, cnt, -8); // if not - copy the remainder
__ bltz(tmp, copy_small); // cnt < 8, go to copy_small, else fall through to copy8_loop

__ bind(copy8);
__ bind(copy8_loop);
if (is_backwards) {
__ addi(src, src, -wordSize);
__ addi(dst, dst, -wordSize);
Expand All @@ -1028,11 +1068,11 @@ class StubGenerator: public StubCodeGenerator {
__ addi(src, src, wordSize);
__ addi(dst, dst, wordSize);
}
__ addi(tmp, cnt, -(8 + wordSize));
__ addi(cnt, cnt, -wordSize);
__ addi(tmp4, cnt, -8);
__ bgez(tmp4, copy8); // cnt >= 8, do next loop
__ bgez(tmp, copy8_loop); // cnt >= 8, do next loop

__ beqz(cnt, done);
__ beqz(cnt, done); // if that's all - done

__ bind(copy_small);
if (is_backwards) {
Expand Down

1 comment on commit 7e340b3

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.