From 9940bbd63286c3eac2bb3aca1f82340188159b56 Mon Sep 17 00:00:00 2001 From: Ningsheng Jian Date: Wed, 17 May 2023 12:22:04 +0800 Subject: [PATCH] Backport 33d9a857308eed53e06b448691910bc8aa2f8fc9 jdk11 also has the bug described in JDK-8307572, so I propose to backport the fix to jdk11u. This is not a clean backport. Main changes: 1) The definitions of reg_class v4_reg..v7_reg and operands vRegD_V4..vRegD_V7, which were introduced in JDK-8214527, are missing in jdk11u aarch64.ad. This patch adds those definitions, as we need to declare these registers as killed in the match rules. 2) JDK-8274243 (not a valid backport for 11u) changed MacroAssembler::encode_iso_array() a lot, but the invalid use of v4/v5 exists both before and after that patch. Fixed the conflicts by using the 11u naming convention. No new jtreg test failures with either release or fastdebug builds. The test case added in JDK-8307572, which fails without the backport, now passes. --- src/hotspot/cpu/aarch64/aarch64.ad | 170 +++++++--- .../cpu/aarch64/macroAssembler_aarch64.cpp | 32 +- .../cpu/aarch64/macroAssembler_aarch64.hpp | 4 +- .../cpu/aarch64/stubGenerator_aarch64.cpp | 4 + .../c2/aarch64/TestIntrinsicsRegStress.java | 296 ++++++++++++++++++ 5 files changed, 449 insertions(+), 57 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 1e4ee33a9db..9d03d720480 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -971,6 +971,26 @@ reg_class v3_reg( V3, V3_H ); +// Class for 128 bit register v4 +reg_class v4_reg( + V4, V4_H +); + +// Class for 128 bit register v5 +reg_class v5_reg( + V5, V5_H +); + +// Class for 128 bit register v6 +reg_class v6_reg( + V6, V6_H +); + +// Class for 128 bit register v7 +reg_class v7_reg( + V7, V7_H +); + // Singleton class for condition codes reg_class int_flags(RFLAGS); @@ -4884,6 +4904,42 @@ operand vRegD_V3() 
interface(REG_INTER); %} +operand vRegD_V4() +%{ + constraint(ALLOC_IN_RC(v4_reg)); + match(RegD); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vRegD_V5() +%{ + constraint(ALLOC_IN_RC(v5_reg)); + match(RegD); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vRegD_V6() +%{ + constraint(ALLOC_IN_RC(v6_reg)); + match(RegD); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vRegD_V7() +%{ + constraint(ALLOC_IN_RC(v7_reg)); + match(RegD); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + // Flags register, used as output of signed compare instructions // note that on AArch64 we also use this register as the output for @@ -15390,14 +15446,17 @@ instruct string_compareLU(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 %} instruct string_indexofUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2, - iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, - iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, + iRegINoSp tmp3, iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UU)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, + TEMP vtmp0, TEMP vtmp1, KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UU) " + "# KILL $str1 $cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -15411,14 +15470,17 @@ instruct string_indexofUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct 
string_indexofLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2, - iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, - iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, + iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LL)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, + TEMP vtmp0, TEMP vtmp1, KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LL) " + "# KILL $str1 $cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -15432,14 +15494,17 @@ instruct string_indexofLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct string_indexofUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2, - iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, - iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, + iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UL)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, + TEMP tmp6, TEMP vtmp0, TEMP vtmp1, 
KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UL) " + "# KILL $str1 cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -15453,14 +15518,15 @@ instruct string_indexofUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct string_indexof_conUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UU)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UU) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -15474,14 +15540,15 @@ instruct string_indexof_conUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, %} instruct string_indexof_conLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (LL)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (LL) " + "# KILL $str1 
$cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -15495,14 +15562,15 @@ instruct string_indexof_conLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, %} instruct string_indexof_conUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_1 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_1 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UL)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UL) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -15567,13 +15635,17 @@ instruct string_equalsU(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt, instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7, iRegP_R10 tmp, rFlagsReg cr) %{ predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (AryEq ary1 ary2)); - effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, + TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, + TEMP vtmp6, TEMP vtmp7, KILL cr); - format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %} + format %{ "Array Equals $ary1,ary2 -> $result # KILL $ary1 $ary2 $tmp $tmp1 $tmp2 $tmp3 V0-V7 cr" %} ins_encode %{ address tpc = __ 
arrays_equals($ary1$$Register, $ary2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, @@ -15588,13 +15660,17 @@ instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7, iRegP_R10 tmp, rFlagsReg cr) %{ predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (AryEq ary1 ary2)); - effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, + TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, + TEMP vtmp6, TEMP vtmp7, KILL cr); - format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %} + format %{ "Array Equals $ary1,ary2 -> $result # KILL $ary1 $ary2 $tmp $tmp1 $tmp2 $tmp3 V0-V7 cr" %} ins_encode %{ address tpc = __ arrays_equals($ary1$$Register, $ary2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, @@ -15624,35 +15700,40 @@ instruct has_negatives(iRegP_R1 ary1, iRegI_R2 len, iRegI_R0 result, rFlagsReg c // fast char[] to byte[] compression instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, - vRegD_V0 tmp1, vRegD_V1 tmp2, - vRegD_V2 tmp3, vRegD_V3 tmp4, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, + vRegD_V3 vtmp3, vRegD_V4 vtmp4, vRegD_V5 vtmp5, iRegI_R0 result, rFlagsReg cr) %{ match(Set result (StrCompressedCopy src (Binary dst len))); - effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); + effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, + USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); - format %{ "String Compress $src,$dst -> $result // KILL R1, R2, R3, R4" %} + format %{ "String Compress $src,$dst -> $result # KILL 
$src $dst $len V0-V5 cr" %} ins_encode %{ __ char_array_compress($src$$Register, $dst$$Register, $len$$Register, - $tmp1$$FloatRegister, $tmp2$$FloatRegister, - $tmp3$$FloatRegister, $tmp4$$FloatRegister, + $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister, + $vtmp4$$FloatRegister, $vtmp5$$FloatRegister, $result$$Register); %} ins_pipe( pipe_slow ); %} // fast byte[] to char[] inflation -instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len, - vRegD_V0 tmp1, vRegD_V1 tmp2, vRegD_V2 tmp3, iRegP_R3 tmp4, rFlagsReg cr) +instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len, iRegP_R3 tmp, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, rFlagsReg cr) %{ match(Set dummy (StrInflatedCopy src (Binary dst len))); - effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); + effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, + TEMP vtmp4, TEMP vtmp5, TEMP vtmp6, TEMP tmp, + USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); - format %{ "String Inflate $src,$dst // KILL $tmp1, $tmp2" %} + format %{ "String Inflate $src,$dst # KILL $tmp $src $dst $len V0-V6 cr" %} ins_encode %{ address tpc = __ byte_array_inflate($src$$Register, $dst$$Register, $len$$Register, - $tmp1$$FloatRegister, $tmp2$$FloatRegister, - $tmp3$$FloatRegister, $tmp4$$Register); + $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $tmp$$Register); if (tpc == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; @@ -15663,19 +15744,20 @@ instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len // encode char[] to byte[] in ISO_8859_1 instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, - vRegD_V0 Vtmp1, vRegD_V1 Vtmp2, - vRegD_V2 Vtmp3, vRegD_V3 Vtmp4, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, + vRegD_V3 vtmp3, 
vRegD_V4 vtmp4, vRegD_V5 vtmp5, iRegI_R0 result, rFlagsReg cr) %{ match(Set result (EncodeISOArray src (Binary dst len))); - effect(USE_KILL src, USE_KILL dst, USE_KILL len, - KILL Vtmp1, KILL Vtmp2, KILL Vtmp3, KILL Vtmp4, KILL cr); + effect(USE_KILL src, USE_KILL dst, USE_KILL len, KILL vtmp0, KILL vtmp1, + KILL vtmp2, KILL vtmp3, KILL vtmp4, KILL vtmp5, KILL cr); - format %{ "Encode array $src,$dst,$len -> $result" %} + format %{ "Encode array $src,$dst,$len -> $result # KILL $src $dst $len V0-V5 cr" %} ins_encode %{ __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, - $result$$Register, $Vtmp1$$FloatRegister, $Vtmp2$$FloatRegister, - $Vtmp3$$FloatRegister, $Vtmp4$$FloatRegister); + $result$$Register, $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister, + $vtmp4$$FloatRegister, $vtmp5$$FloatRegister); %} ins_pipe( pipe_class_memory ); %} diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index 5753cc9a611..7f329a45d30 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -4332,6 +4332,7 @@ void MacroAssembler::remove_frame(int framesize) { typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); // Search for str1 in str2 and return index or -1 +// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. 
void MacroAssembler::string_indexof(Register str2, Register str1, Register cnt2, Register cnt1, Register tmp1, Register tmp2, @@ -5123,6 +5124,8 @@ address MacroAssembler::has_negatives(Register ary1, Register len, Register resu return pc(); } +// Clobbers: rscratch1, rscratch2, rflags +// May also clobber v0-v7 when (!UseSimpleArrayEquals && UseSIMDForArrayEquals) address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, Register tmp4, Register tmp5, Register result, Register cnt1, int elem_size) { @@ -5615,10 +5618,13 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value) // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and // java/lang/StringUTF16.compress. +// +// Clobbers: src, dst, res, rscratch1, rscratch2, rflags void MacroAssembler::encode_iso_array(Register src, Register dst, - Register len, Register result, - FloatRegister Vtmp1, FloatRegister Vtmp2, - FloatRegister Vtmp3, FloatRegister Vtmp4) + Register len, Register result, + FloatRegister Vtmp1, FloatRegister Vtmp2, + FloatRegister Vtmp3, FloatRegister Vtmp4, + FloatRegister Vtmp5, FloatRegister Vtmp6) { Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1, NEXT_32_START, NEXT_32_PRFM_START; @@ -5641,13 +5647,13 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); BIND(NEXT_32_PRFM_START); prfm(Address(src, SoftwarePrefetchHintDistance)); - orr(v4, T16B, Vtmp1, Vtmp2); - orr(v5, T16B, Vtmp3, Vtmp4); + orr(Vtmp5, T16B, Vtmp1, Vtmp2); + orr(Vtmp6, T16B, Vtmp3, Vtmp4); uzp1(Vtmp1, T16B, Vtmp1, Vtmp2); uzp1(Vtmp3, T16B, Vtmp3, Vtmp4); - uzp2(v5, T16B, v4, v5); // high bytes - umov(tmp2, v5, D, 1); - fmovd(tmp1, v5); + uzp2(Vtmp6, T16B, Vtmp5, Vtmp6); // high bytes + umov(tmp2, Vtmp6, D, 1); + fmovd(tmp1, Vtmp6); orr(tmp1, tmp1, tmp2); cbnz(tmp1, LOOP_8); stpq(Vtmp1, Vtmp3, dst); @@ -5666,8 +5672,8 @@ void MacroAssembler::encode_iso_array(Register src, Register 
dst, ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); } prfm(Address(src, SoftwarePrefetchHintDistance)); - uzp1(v4, T16B, Vtmp1, Vtmp2); - uzp1(v5, T16B, Vtmp3, Vtmp4); + uzp1(Vtmp5, T16B, Vtmp1, Vtmp2); + uzp1(Vtmp6, T16B, Vtmp3, Vtmp4); orr(Vtmp1, T16B, Vtmp1, Vtmp2); orr(Vtmp3, T16B, Vtmp3, Vtmp4); uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes @@ -5675,7 +5681,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, fmovd(tmp1, Vtmp1); orr(tmp1, tmp1, tmp2); cbnz(tmp1, LOOP_8); - stpq(v4, v5, dst); + stpq(Vtmp5, Vtmp6, dst); sub(len, len, 32); add(dst, dst, 32); add(src, src, 64); @@ -5720,6 +5726,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, // Inflate byte[] array to char[]. +// Clobbers: src, dst, len, rflags, rscratch1, v0-v6 address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, Register tmp4) { @@ -5828,9 +5835,10 @@ address MacroAssembler::byte_array_inflate(Register src, Register dst, Register void MacroAssembler::char_array_compress(Register src, Register dst, Register len, FloatRegister tmp1Reg, FloatRegister tmp2Reg, FloatRegister tmp3Reg, FloatRegister tmp4Reg, + FloatRegister tmp5Reg, FloatRegister tmp6Reg, Register result) { encode_iso_array(src, dst, len, result, - tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); + tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg, tmp5Reg, tmp6Reg); cmp(len, zr); csel(result, result, zr, EQ); } diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index 7e23c16a442..01fdf16a01c 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -1245,12 +1245,14 @@ class MacroAssembler: public Assembler { void char_array_compress(Register src, Register dst, Register len, FloatRegister tmp1Reg, FloatRegister tmp2Reg, FloatRegister tmp3Reg, FloatRegister tmp4Reg, + FloatRegister tmp5Reg, 
FloatRegister tmp6Reg, Register result); void encode_iso_array(Register src, Register dst, Register len, Register result, FloatRegister Vtmp1, FloatRegister Vtmp2, - FloatRegister Vtmp3, FloatRegister Vtmp4); + FloatRegister Vtmp3, FloatRegister Vtmp4, + FloatRegister Vtmp5, FloatRegister Vtmp6); void string_indexof(Register str1, Register str2, Register cnt1, Register cnt2, Register tmp1, Register tmp2, diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index bd4b5d7c13f..482784d6b7b 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -4099,6 +4099,7 @@ class StubGenerator: public StubCodeGenerator { // result = r0 - return value. Already contains "false" // cnt1 = r10 - amount of elements left to check, reduced by wordSize // r3-r5 are reserved temporary registers + // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 address generate_large_array_equals() { Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, @@ -4503,6 +4504,8 @@ class StubGenerator: public StubCodeGenerator { // R2 = cnt1 // R3 = str1 // R4 = cnt2 + // Clobbers: rscratch1, rscratch2, v0, v1, rflags + // // This generic linear code use few additional ideas, which makes it faster: // 1) we can safely keep at least 1st register of pattern(since length >= 8) // in order to skip initial loading(help in systems with 1 ld pipeline) @@ -4817,6 +4820,7 @@ class StubGenerator: public StubCodeGenerator { // R3 = len >> 3 // V0 = 0 // v1 = loaded 8 bytes + // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 address generate_large_byte_array_inflate() { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java 
b/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java new file mode 100644 index 00000000000..960661b975a --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2023, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +/* + * @test + * @bug 8307572 + * @summary Verify vector register clobbering in some aarch64 intrinsics + * @library /compiler/patches /test/lib + * @build java.base/java.lang.Helper + * @run main/othervm -Xbatch -XX:CompileThreshold=100 -XX:-TieredCompilation compiler.c2.aarch64.TestIntrinsicsRegStress + */ + +package compiler.c2.aarch64; + +import java.util.Arrays; + +public class TestIntrinsicsRegStress { + + final int LENGTH = 1024; + final int ITER = 10000; + final int NUM = 32; + + byte[] ba; + char[] ca; + char[] cb; + float[] fv; + + String str; + String[] strings; + String needle = "01234567890123456789"; + + public void init() { + ca = new char[LENGTH]; + fv = new float[NUM]; + strings = new String[NUM]; + for (int i = 0; i < LENGTH; i++) { + ca[i] = (char) ('a' + i % NUM); + } + cb = ca.clone(); + str = new String(ca); + for (int i = 0; i < NUM; i++) { + fv[i] = 1; + } + for (int i = 0; i < NUM; i++) { + strings[i] = str.substring(i) + needle; + } + } + + public void checkIndexOf(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + int result = strings[iter % NUM].indexOf(needle); + + if (result > 
LENGTH - NUM / 2) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. + t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testIndexOf() { + for (int i = 0; i < ITER; i++) { + checkIndexOf(i); + } + } + + public void checkArraysEquals() { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + if (Arrays.equals(ca, cb)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. 
+ t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testArraysEquals() { + for (int i = 0; i < ITER; i++) { + checkArraysEquals(); + } + } + + public void checkCompress(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + ba = Helper.compressChar(ca, 0, LENGTH, 0, LENGTH); + + if (ba[iter % LENGTH] > (byte) ('a' + 5)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. 
+ t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testCompress() { + for (int i = 0; i < ITER; i++) { + checkCompress(i); + } + } + + public void checkInflate(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + str.getChars(0, LENGTH, ca, 0); + + if (ca[iter % LENGTH] > (byte) ('a' + NUM / 2)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. 
+ t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testInflate() { + for (int i = 0; i < ITER; i++) { + checkInflate(i); + } + } + + public void verifyAndReset() { + if (fv[31] != 1.0) { + throw new RuntimeException("Failed with " + Float.toString(fv[31])); + } else { + System.out.println("Success!"); + } + fv[31] = 1.0f; + } + + public static void main(String[] args) { + TestIntrinsicsRegStress t = new TestIntrinsicsRegStress(); + t.init(); + + t.testIndexOf(); + t.verifyAndReset(); + + t.testArraysEquals(); + t.verifyAndReset(); + + t.testCompress(); + t.verifyAndReset(); + + t.testInflate(); + t.verifyAndReset(); + } +}