From e8a6b07be115ab970ad22b02e2075bfbc4acc60c Mon Sep 17 00:00:00 2001
From: Ningsheng Jian
Date: Wed, 26 Apr 2023 12:22:02 +0800
Subject: [PATCH] 8307572: AArch64: Vector registers are clobbered by some
 macroassemblers

I found that MacroAssembler::arrays_equals() may call stub code that uses
vector registers, yet the call site in the match rule does not claim any
vector registers. Because C2 allocates v16-v31 first [1], use of v0-v7
rarely causes a problem, but I created a test case that exposes the bug.
Apart from arrays_equals, I checked the other macroassembler routines and
found several similar issues.

Fixed by marking the clobbered vector registers as killed at the match-rule
call sites. This should have minimal performance impact compared to always
saving/restoring those vector registers, since the V0-Vx registers are
rarely allocated and live across the macroassembler call. A jtreg test case
is added to demonstrate the failure: it fails without this patch and passes
with it.

Test: I changed the allocation order in [1] to allocate V0-V15 first and
then V16-V31, and full jtreg tests passed with that order. I also eyeballed
the remaining macroassembler calls and they appear fine.

[1] https://github.com/openjdk/jdk/blob/master/src/hotspot/cpu/aarch64/aarch64.ad#L424

Change-Id: I0feb0c3f3761732a642b3080eb383e0d6ce77825
---
 src/hotspot/cpu/aarch64/aarch64.ad | 125 +++++---
 .../cpu/aarch64/c2_MacroAssembler_aarch64.cpp | 1 +
 .../cpu/aarch64/macroAssembler_aarch64.cpp | 16 +-
 .../cpu/aarch64/macroAssembler_aarch64.hpp | 6 +-
 .../cpu/aarch64/stubGenerator_aarch64.cpp | 4 +
 .../c2/aarch64/TestIntrinsicsRegStress.java | 296 ++++++++++++++++++
 6 files changed, 391 insertions(+), 57 deletions(-)
 create mode 100644 test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java

diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad
index 77415989caf5b..16dda07d5579a 100644
--- a/src/hotspot/cpu/aarch64/aarch64.ad
+++ b/src/hotspot/cpu/aarch64/aarch64.ad
@@ -17078,14 +17078,17 @@ instruct string_compareUU_sve(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI
 %}
 
 instruct string_indexofUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
-                          iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3,
-                          iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr)
+                          iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2,
+                          iRegINoSp tmp3, iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6,
+                          vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr)
 %{
   predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU);
   match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2)));
   effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2,
-         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr);
-  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UU)" %}
+         TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6,
+         TEMP vtmp0, TEMP vtmp1, KILL cr);
+  format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UU) "
+            "# KILL $str1 $cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %}
 
   ins_encode %{
     __ string_indexof($str1$$Register, $str2$$Register,
@@ -17099,14 +17102,17 @@ instruct string_indexofUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2
 %}
 
 instruct string_indexofLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2,
-                          iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3,
-                          
iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, + iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LL)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, + TEMP vtmp0, TEMP vtmp1, KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LL) " + "# KILL $str1 $cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -17120,14 +17126,17 @@ instruct string_indexofLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct string_indexofUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2, - iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, - iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2,iRegINoSp tmp3, + iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UL)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, + TEMP tmp6, TEMP vtmp0, TEMP vtmp1, KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UL) " + "# KILL $str1 cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -17141,14 +17150,15 @@ instruct string_indexofUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct string_indexof_conUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UU)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UU) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -17162,14 +17172,15 @@ instruct string_indexof_conUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, %} instruct string_indexof_conLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (StrIndexOf (Binary str1 cnt1) 
(Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (LL)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (LL) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -17183,14 +17194,15 @@ instruct string_indexof_conLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, %} instruct string_indexof_conUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_1 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_1 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UL)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UL) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -17307,13 +17319,17 @@ instruct string_equalsU(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt, instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7, iRegP_R10 tmp, rFlagsReg cr) %{ predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (AryEq ary1 ary2)); - effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, + TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, + TEMP vtmp6, TEMP vtmp7, KILL cr); - format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %} + format %{ "Array Equals $ary1,ary2 -> $result # KILL $ary1 $ary2 $tmp $tmp1 $tmp2 $tmp3 V0-V7 cr" %} ins_encode %{ address tpc = __ arrays_equals($ary1$$Register, $ary2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, @@ -17328,13 +17344,17 @@ instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7, iRegP_R10 tmp, rFlagsReg cr) %{ predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (AryEq ary1 ary2)); - effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, + TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, + TEMP vtmp6, TEMP vtmp7, KILL cr); - format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %} + format %{ "Array Equals $ary1,ary2 -> $result # KILL $ary1 $ary2 $tmp $tmp1 $tmp2 $tmp3 V0-V7 cr" %} ins_encode %{ address tpc = __ arrays_equals($ary1$$Register, $ary2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, @@ -17364,36 +17384,39 @@ instruct count_positives(iRegP_R1 ary1, iRegI_R2 len, iRegI_R0 result, rFlagsReg // fast char[] 
to byte[] compression instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, - vRegD_V0 tmp1, vRegD_V1 tmp2, - vRegD_V2 tmp3, vRegD_V3 tmp4, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, + vRegD_V3 vtmp3, vRegD_V4 vtmp4, vRegD_V5 vtmp5, iRegI_R0 result, rFlagsReg cr) %{ match(Set result (StrCompressedCopy src (Binary dst len))); - effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, + effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, USE_KILL src, USE_KILL dst, USE len, KILL cr); - format %{ "String Compress $src,$dst,$len -> $result // KILL $src,$dst" %} + format %{ "String Compress $src,$dst,$len -> $result # KILL $src $dst V0-V5 cr" %} ins_encode %{ __ char_array_compress($src$$Register, $dst$$Register, $len$$Register, - $result$$Register, - $tmp1$$FloatRegister, $tmp2$$FloatRegister, - $tmp3$$FloatRegister, $tmp4$$FloatRegister); + $result$$Register, $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister, + $vtmp4$$FloatRegister, $vtmp5$$FloatRegister); %} ins_pipe(pipe_slow); %} // fast byte[] to char[] inflation -instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len, - vRegD_V0 tmp1, vRegD_V1 tmp2, vRegD_V2 tmp3, iRegP_R3 tmp4, rFlagsReg cr) +instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len, iRegP_R3 tmp, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, rFlagsReg cr) %{ match(Set dummy (StrInflatedCopy src (Binary dst len))); - effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); + effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, + TEMP vtmp4, TEMP vtmp5, TEMP vtmp6, TEMP tmp, + USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); - format %{ "String Inflate $src,$dst // KILL $tmp1, $tmp2" %} + format %{ "String Inflate $src,$dst # KILL $tmp $src $dst $len V0-V6 cr" %} ins_encode %{ address tpc = __ byte_array_inflate($src$$Register, $dst$$Register, $len$$Register, - $tmp1$$FloatRegister, $tmp2$$FloatRegister, - $tmp3$$FloatRegister, $tmp4$$Register); + $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $tmp$$Register); if (tpc == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; @@ -17404,41 +17427,43 @@ instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len // encode char[] to byte[] in ISO_8859_1 instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, - vRegD_V0 vtmp0, vRegD_V1 vtmp1, - vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, + vRegD_V3 vtmp3, vRegD_V4 vtmp4, vRegD_V5 vtmp5, iRegI_R0 result, rFlagsReg cr) %{ predicate(!((EncodeISOArrayNode*)n)->is_ascii()); match(Set result (EncodeISOArray src (Binary dst len))); - effect(USE_KILL src, USE_KILL dst, USE len, - KILL vtmp0, KILL vtmp1, KILL vtmp2, KILL vtmp3, KILL cr); + effect(USE_KILL src, USE_KILL dst, USE len, KILL vtmp0, KILL vtmp1, + KILL vtmp2, KILL vtmp3, KILL vtmp4, KILL vtmp5, KILL cr); - format %{ "Encode ISO array $src,$dst,$len -> $result" %} + format %{ "Encode ISO array $src,$dst,$len -> $result # KILL $src $dst V0-V5 cr" %} ins_encode %{ __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, $result$$Register, false, $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, - $vtmp2$$FloatRegister, $vtmp3$$FloatRegister); + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister, + $vtmp4$$FloatRegister, $vtmp5$$FloatRegister); %} 
ins_pipe(pipe_class_memory); %} instruct encode_ascii_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, - vRegD_V0 vtmp0, vRegD_V1 vtmp1, - vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, + vRegD_V3 vtmp3, vRegD_V4 vtmp4, vRegD_V5 vtmp5, iRegI_R0 result, rFlagsReg cr) %{ predicate(((EncodeISOArrayNode*)n)->is_ascii()); match(Set result (EncodeISOArray src (Binary dst len))); - effect(USE_KILL src, USE_KILL dst, USE len, - KILL vtmp0, KILL vtmp1, KILL vtmp2, KILL vtmp3, KILL cr); + effect(USE_KILL src, USE_KILL dst, USE len, KILL vtmp0, KILL vtmp1, + KILL vtmp2, KILL vtmp3, KILL vtmp4, KILL vtmp5, KILL cr); - format %{ "Encode ASCII array $src,$dst,$len -> $result" %} + format %{ "Encode ASCII array $src,$dst,$len -> $result # KILL $src $dst V0-V5 cr" %} ins_encode %{ __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, $result$$Register, true, $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, - $vtmp2$$FloatRegister, $vtmp3$$FloatRegister); + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister, + $vtmp4$$FloatRegister, $vtmp5$$FloatRegister); %} ins_pipe(pipe_class_memory); %} diff --git a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp index e96621ae2d378..dbe64f8f9ca74 100644 --- a/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp @@ -46,6 +46,7 @@ typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); // Search for str1 in str2 and return index or -1 +// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. void C2_MacroAssembler::string_indexof(Register str2, Register str1, Register cnt2, Register cnt1, Register tmp1, Register tmp2, diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index 350f8082c3419..6ca72ccc65b9a 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -5008,6 +5008,8 @@ address MacroAssembler::count_positives(Register ary1, Register len, Register re return pc(); } +// Clobbers: rscratch1, rscratch2, rflags +// May also clobber v0-v7 when (!UseSimpleArrayEquals && UseSIMDForArrayEquals) address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, Register tmp4, Register tmp5, Register result, Register cnt1, int elem_size) { @@ -5557,10 +5559,12 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value) // Using 'umaxv' in the ASCII-case comes with a small penalty but does // avoid additional bloat. // +// Clobbers: src, dst, res, rscratch1, rscratch2, rflags void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, Register res, bool ascii, FloatRegister vtmp0, FloatRegister vtmp1, - FloatRegister vtmp2, FloatRegister vtmp3) + FloatRegister vtmp2, FloatRegister vtmp3, + FloatRegister vtmp4, FloatRegister vtmp5) { Register cnt = res; Register max = rscratch1; @@ -5579,8 +5583,8 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, br(LT, DONE_32); ld1(vtmp0, vtmp1, vtmp2, vtmp3, T8H, Address(post(src, 64))); // Extract lower bytes. - FloatRegister vlo0 = v4; - FloatRegister vlo1 = v5; + FloatRegister vlo0 = vtmp4; + FloatRegister vlo1 = vtmp5; uzp1(vlo0, T16B, vtmp0, vtmp1); uzp1(vlo1, T16B, vtmp2, vtmp3); // Merge bits... @@ -5653,6 +5657,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, } // Inflate byte[] array to char[]. 
+// Clobbers: src, dst, len, rflags, rscratch1, v0-v6 address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, Register tmp4) { @@ -5761,8 +5766,9 @@ address MacroAssembler::byte_array_inflate(Register src, Register dst, Register void MacroAssembler::char_array_compress(Register src, Register dst, Register len, Register res, FloatRegister tmp0, FloatRegister tmp1, - FloatRegister tmp2, FloatRegister tmp3) { - encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3); + FloatRegister tmp2, FloatRegister tmp3, + FloatRegister tmp4, FloatRegister tmp5) { + encode_iso_array(src, dst, len, res, false, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5); // Adjust result: res == len ? len : 0 cmp(len, res); csel(res, res, zr, EQ); diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index 6211f1e74f208..05ba5861f019d 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -1393,12 +1393,14 @@ class MacroAssembler: public Assembler { void char_array_compress(Register src, Register dst, Register len, Register res, FloatRegister vtmp0, FloatRegister vtmp1, - FloatRegister vtmp2, FloatRegister vtmp3); + FloatRegister vtmp2, FloatRegister vtmp3, + FloatRegister vtmp4, FloatRegister vtmp5); void encode_iso_array(Register src, Register dst, Register len, Register res, bool ascii, FloatRegister vtmp0, FloatRegister vtmp1, - FloatRegister vtmp2, FloatRegister vtmp3); + FloatRegister vtmp2, FloatRegister vtmp3, + FloatRegister vtmp4, FloatRegister vtmp5); void fast_log(FloatRegister vtmp0, FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, FloatRegister vtmp4, FloatRegister vtmp5, diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index 389bb0d7d0e88..ed3602892988a 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -5151,6 +5151,7 @@ class StubGenerator: public StubCodeGenerator { // result = r0 - return value. 
Already contains "false" // cnt1 = r10 - amount of elements left to check, reduced by wordSize // r3-r5 are reserved temporary registers + // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 address generate_large_array_equals() { Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, @@ -5734,6 +5735,8 @@ class StubGenerator: public StubCodeGenerator { // R2 = cnt1 // R3 = str1 // R4 = cnt2 + // Clobbers: rscratch1, rscratch2, v0, v1, rflags + // // This generic linear code use few additional ideas, which makes it faster: // 1) we can safely keep at least 1st register of pattern(since length >= 8) // in order to skip initial loading(help in systems with 1 ld pipeline) @@ -6048,6 +6051,7 @@ class StubGenerator: public StubCodeGenerator { // R3 = len >> 3 // V0 = 0 // v1 = loaded 8 bytes + // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 address generate_large_byte_array_inflate() { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java new file mode 100644 index 0000000000000..960661b975a54 --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2023, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +/* + * @test + * @bug 8307572 + * @summary Verify vector register clobbering in some aarch64 intrinsics + * @library /compiler/patches /test/lib + * @build java.base/java.lang.Helper + * @run main/othervm -Xbatch -XX:CompileThreshold=100 -XX:-TieredCompilation compiler.c2.aarch64.TestIntrinsicsRegStress + */ + +package compiler.c2.aarch64; + +import java.util.Arrays; + +public class TestIntrinsicsRegStress { + + final int LENGTH = 1024; + final int ITER = 10000; + final int NUM = 32; + + byte[] ba; + char[] ca; + char[] cb; + float[] fv; + + String str; + String[] strings; + String needle = "01234567890123456789"; + + public void init() { + ca = new char[LENGTH]; + fv = new float[NUM]; + strings = new String[NUM]; + for (int i = 0; i < LENGTH; i++) { + ca[i] = (char) ('a' + i % NUM); + } + cb = ca.clone(); + str = new String(ca); + for (int i = 0; i < NUM; i++) { + fv[i] = 1; + } + for (int i = 0; i < NUM; i++) { + strings[i] = str.substring(i) + needle; + } + } + + public void checkIndexOf(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + int result = strings[iter % NUM].indexOf(needle); + + if (result > LENGTH - NUM / 2) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. + t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testIndexOf() { + for (int i = 0; i < ITER; i++) { + checkIndexOf(i); + } + } + + public void checkArraysEquals() { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + if (Arrays.equals(ca, cb)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. 
+ t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testArraysEquals() { + for (int i = 0; i < ITER; i++) { + checkArraysEquals(); + } + } + + public void checkCompress(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + ba = Helper.compressChar(ca, 0, LENGTH, 0, LENGTH); + + if (ba[iter % LENGTH] > (byte) ('a' + 5)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. + t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testCompress() { + for (int i = 0; i < ITER; i++) { + checkCompress(i); + } + } + + public void checkInflate(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + str.getChars(0, LENGTH, ca, 0); + + if (ca[iter % LENGTH] > (byte) ('a' + NUM / 2)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. 
+ t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testInflate() { + for (int i = 0; i < ITER; i++) { + checkInflate(i); + } + } + + public void verifyAndReset() { + if (fv[31] != 1.0) { + throw new RuntimeException("Failed with " + Float.toString(fv[31])); + } else { + System.out.println("Success!"); + } + fv[31] = 1.0f; + } + + public static void main(String[] args) { + TestIntrinsicsRegStress t = new TestIntrinsicsRegStress(); + t.init(); + + t.testIndexOf(); + t.verifyAndReset(); + + t.testArraysEquals(); + t.verifyAndReset(); + + t.testCompress(); + t.verifyAndReset(); + + t.testInflate(); + t.verifyAndReset(); + } +}
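
Note (illustration only, not part of the patch): the 296-line jtreg test above boils down to a single pattern — keep many independent float locals live across an intrinsified call, then check that a balanced sum of them is unchanged afterwards. The hypothetical class below is a distilled sketch of that pattern for the Arrays.equals case; the class and field names are invented, and it keeps only four temporaries for readability, whereas the real test declares about 31 so that v16-v31 are exhausted and C2 has to place some values in v0-v7. To get the method C2-compiled, run it with flags similar to the test's @run line, e.g. -Xbatch -XX:CompileThreshold=100 -XX:-TieredCompilation.

import java.util.Arrays;

// Illustrative sketch only (hypothetical class, not part of the patch): the shape of
// TestIntrinsicsRegStress.checkArraysEquals() reduced to its essentials.
public class ArraysEqualsClobberSketch {
    static final int ITER = 10_000;
    static char[] a = new char[1024];
    static char[] b = a.clone();
    static float[] fv = new float[32];

    static void check() {
        // Independent float temporaries kept live across the intrinsic call below.
        // The real test declares ~31 of these so the register allocator runs out of
        // v16-v31 and must also hand out v0-v7, which the arrays_equals stub may use.
        float t1 = fv[1] * fv[0];
        float t2 = fv[2] * fv[0];
        float t3 = fv[3] * fv[0];
        float t4 = fv[4] * fv[0];
        float t0 = 0.0f;

        boolean eq = Arrays.equals(a, b);   // intrinsified as AryEq (UU) on AArch64

        if (eq) {
            t0 += t1 - t2 + t3 - t4;        // balanced sum: exactly 0.0f unless a
                                            // temporary was clobbered across the call
        }
        fv[31] += t0;                       // must stay exactly 1.0f
    }

    public static void main(String[] args) {
        Arrays.fill(fv, 1.0f);
        for (int i = 0; i < ITER; i++) {    // warm up until C2 compiles check()
            check();
        }
        if (fv[31] != 1.0f) {
            throw new RuntimeException("Float temporaries were clobbered: " + fv[31]);
        }
        System.out.println("ok");
    }
}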