diff --git a/src/hotspot/cpu/aarch64/aarch64.ad b/src/hotspot/cpu/aarch64/aarch64.ad index 1e4ee33a9db..9d03d720480 100644 --- a/src/hotspot/cpu/aarch64/aarch64.ad +++ b/src/hotspot/cpu/aarch64/aarch64.ad @@ -971,6 +971,26 @@ reg_class v3_reg( V3, V3_H ); +// Class for 128 bit register v4 +reg_class v4_reg( + V4, V4_H +); + +// Class for 128 bit register v5 +reg_class v5_reg( + V5, V5_H +); + +// Class for 128 bit register v6 +reg_class v6_reg( + V6, V6_H +); + +// Class for 128 bit register v7 +reg_class v7_reg( + V7, V7_H +); + // Singleton class for condition codes reg_class int_flags(RFLAGS); @@ -4884,6 +4904,42 @@ operand vRegD_V3() interface(REG_INTER); %} +operand vRegD_V4() +%{ + constraint(ALLOC_IN_RC(v4_reg)); + match(RegD); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vRegD_V5() +%{ + constraint(ALLOC_IN_RC(v5_reg)); + match(RegD); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vRegD_V6() +%{ + constraint(ALLOC_IN_RC(v6_reg)); + match(RegD); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + +operand vRegD_V7() +%{ + constraint(ALLOC_IN_RC(v7_reg)); + match(RegD); + op_cost(0); + format %{ %} + interface(REG_INTER); +%} + // Flags register, used as output of signed compare instructions // note that on AArch64 we also use this register as the output for @@ -15390,14 +15446,17 @@ instruct string_compareLU(iRegP_R1 str1, iRegI_R2 cnt1, iRegP_R3 str2, iRegI_R4 %} instruct string_indexofUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2, - iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, - iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, + iRegINoSp tmp3, iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UU)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, + TEMP vtmp0, TEMP vtmp1, KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UU) " + "# KILL $str1 $cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -15411,14 +15470,17 @@ instruct string_indexofUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct string_indexofLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2, - iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, - iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, + iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LL)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, + TEMP vtmp0, TEMP vtmp1, KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (LL) " + "# KILL $str1 $cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -15432,14 +15494,17 @@ instruct string_indexofLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct string_indexofUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 cnt2, - iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, - iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, rFlagsReg cr) + iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, iRegINoSp tmp3, + iRegINoSp tmp4, iRegINoSp tmp5, iRegINoSp tmp6, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, - TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, TEMP tmp6, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UL)" %} + TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP tmp5, + TEMP tmp6, TEMP vtmp0, TEMP vtmp1, KILL cr); + format %{ "String IndexOf $str1,$cnt1,$str2,$cnt2 -> $result (UL) " + "# KILL $str1 cnt1 $str2 $cnt2 $tmp1 $tmp2 $tmp3 $tmp4 $tmp5 $tmp6 V0-V1 cr" %} ins_encode %{ __ string_indexof($str1$$Register, $str2$$Register, @@ -15453,14 +15518,15 @@ instruct string_indexofUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, iRegI_R2 %} instruct string_indexof_conUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UU)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UU) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -15474,14 +15540,15 @@ instruct string_indexof_conUU(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, %} instruct string_indexof_conLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_le_4 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (LL)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (LL) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -15495,14 +15562,15 @@ instruct string_indexof_conLL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, %} instruct string_indexof_conUL(iRegP_R1 str1, iRegI_R4 cnt1, iRegP_R3 str2, - immI_1 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, iRegINoSp tmp2, - iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) + immI_1 int_cnt2, iRegI_R0 result, iRegINoSp tmp1, + iRegINoSp tmp2, iRegINoSp tmp3, iRegINoSp tmp4, rFlagsReg cr) %{ predicate(((StrIndexOfNode*)n)->encoding() == StrIntrinsicNode::UL); match(Set result (StrIndexOf (Binary str1 cnt1) (Binary str2 int_cnt2))); effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt1, TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, KILL cr); - format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UL)" %} + format %{ "String IndexOf $str1,$cnt1,$str2,$int_cnt2 -> $result (UL) " + "# KILL $str1 $cnt1 $str2 $tmp1 $tmp2 $tmp3 $tmp4 cr" %} ins_encode %{ int icnt2 = (int)$int_cnt2$$constant; @@ -15567,13 +15635,17 @@ instruct string_equalsU(iRegP_R1 str1, iRegP_R3 str2, iRegI_R4 cnt, instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7, iRegP_R10 tmp, rFlagsReg cr) %{ predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); match(Set result (AryEq ary1 ary2)); - effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, + TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, + TEMP vtmp6, TEMP vtmp7, KILL cr); - format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %} + format %{ "Array Equals $ary1,ary2 -> $result # KILL $ary1 $ary2 $tmp $tmp1 $tmp2 $tmp3 V0-V7 cr" %} ins_encode %{ address tpc = __ arrays_equals($ary1$$Register, $ary2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, @@ -15588,13 +15660,17 @@ instruct array_equalsB(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, instruct array_equalsC(iRegP_R1 ary1, iRegP_R2 ary2, iRegI_R0 result, iRegP_R3 tmp1, iRegP_R4 tmp2, iRegP_R5 tmp3, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, vRegD_V7 vtmp7, iRegP_R10 tmp, rFlagsReg cr) %{ predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); match(Set result (AryEq ary1 ary2)); - effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, KILL cr); + effect(KILL tmp, USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP tmp3, + TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, + TEMP vtmp6, TEMP vtmp7, KILL cr); - format %{ "Array Equals $ary1,ary2 -> $result // KILL $tmp" %} + format %{ "Array Equals $ary1,ary2 -> $result # KILL $ary1 $ary2 $tmp $tmp1 $tmp2 $tmp3 V0-V7 cr" %} ins_encode %{ address tpc = __ arrays_equals($ary1$$Register, $ary2$$Register, $tmp1$$Register, $tmp2$$Register, $tmp3$$Register, @@ -15624,35 +15700,40 @@ instruct has_negatives(iRegP_R1 ary1, iRegI_R2 len, iRegI_R0 result, rFlagsReg c // fast char[] to byte[] compression instruct string_compress(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, - vRegD_V0 tmp1, vRegD_V1 tmp2, - vRegD_V2 tmp3, vRegD_V3 tmp4, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, + vRegD_V3 vtmp3, vRegD_V4 vtmp4, vRegD_V5 vtmp5, iRegI_R0 result, rFlagsReg cr) %{ match(Set result (StrCompressedCopy src (Binary dst len))); - effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); + effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, TEMP vtmp4, TEMP vtmp5, + USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); - format %{ "String Compress $src,$dst -> $result // KILL R1, R2, R3, R4" %} + format %{ "String Compress $src,$dst -> $result # KILL $src $dst $len V0-V5 cr" %} ins_encode %{ __ char_array_compress($src$$Register, $dst$$Register, $len$$Register, - $tmp1$$FloatRegister, $tmp2$$FloatRegister, - $tmp3$$FloatRegister, $tmp4$$FloatRegister, + $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister, + $vtmp4$$FloatRegister, $vtmp5$$FloatRegister, $result$$Register); %} ins_pipe( pipe_slow ); %} // fast byte[] to char[] inflation -instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len, - vRegD_V0 tmp1, vRegD_V1 tmp2, vRegD_V2 tmp3, iRegP_R3 tmp4, rFlagsReg cr) +instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len, iRegP_R3 tmp, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, vRegD_V3 vtmp3, + vRegD_V4 vtmp4, vRegD_V5 vtmp5, vRegD_V6 vtmp6, rFlagsReg cr) %{ match(Set dummy (StrInflatedCopy src (Binary dst len))); - effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); + effect(TEMP vtmp0, TEMP vtmp1, TEMP vtmp2, TEMP vtmp3, + TEMP vtmp4, TEMP vtmp5, TEMP vtmp6, TEMP tmp, + USE_KILL src, USE_KILL dst, USE_KILL len, KILL cr); - format %{ "String Inflate $src,$dst // KILL $tmp1, $tmp2" %} + format %{ "String Inflate $src,$dst # KILL $tmp $src $dst $len V0-V6 cr" %} ins_encode %{ address tpc = __ byte_array_inflate($src$$Register, $dst$$Register, $len$$Register, - $tmp1$$FloatRegister, $tmp2$$FloatRegister, - $tmp3$$FloatRegister, $tmp4$$Register); + $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $tmp$$Register); if (tpc == NULL) { ciEnv::current()->record_failure("CodeCache is full"); return; @@ -15663,19 +15744,20 @@ instruct string_inflate(Universe dummy, iRegP_R0 src, iRegP_R1 dst, iRegI_R2 len // encode char[] to byte[] in ISO_8859_1 instruct encode_iso_array(iRegP_R2 src, iRegP_R1 dst, iRegI_R3 len, - vRegD_V0 Vtmp1, vRegD_V1 Vtmp2, - vRegD_V2 Vtmp3, vRegD_V3 Vtmp4, + vRegD_V0 vtmp0, vRegD_V1 vtmp1, vRegD_V2 vtmp2, + vRegD_V3 vtmp3, vRegD_V4 vtmp4, vRegD_V5 vtmp5, iRegI_R0 result, rFlagsReg cr) %{ match(Set result (EncodeISOArray src (Binary dst len))); - effect(USE_KILL src, USE_KILL dst, USE_KILL len, - KILL Vtmp1, KILL Vtmp2, KILL Vtmp3, KILL Vtmp4, KILL cr); + effect(USE_KILL src, USE_KILL dst, USE_KILL len, KILL vtmp0, KILL vtmp1, + KILL vtmp2, KILL vtmp3, KILL vtmp4, KILL vtmp5, KILL cr); - format %{ "Encode array $src,$dst,$len -> $result" %} + format %{ "Encode array $src,$dst,$len -> $result # KILL $src $dst $len V0-V5 cr" %} ins_encode %{ __ encode_iso_array($src$$Register, $dst$$Register, $len$$Register, - $result$$Register, $Vtmp1$$FloatRegister, $Vtmp2$$FloatRegister, - $Vtmp3$$FloatRegister, $Vtmp4$$FloatRegister); + $result$$Register, $vtmp0$$FloatRegister, $vtmp1$$FloatRegister, + $vtmp2$$FloatRegister, $vtmp3$$FloatRegister, + $vtmp4$$FloatRegister, $vtmp5$$FloatRegister); %} ins_pipe( pipe_class_memory ); %} diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp index 5753cc9a611..7f329a45d30 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp @@ -4332,6 +4332,7 @@ void MacroAssembler::remove_frame(int framesize) { typedef void (MacroAssembler::* chr_insn)(Register Rt, const Address &adr); // Search for str1 in str2 and return index or -1 +// Clobbers: rscratch1, rscratch2, rflags. May also clobber v0-v1, when icnt1==-1. void MacroAssembler::string_indexof(Register str2, Register str1, Register cnt2, Register cnt1, Register tmp1, Register tmp2, @@ -5123,6 +5124,8 @@ address MacroAssembler::has_negatives(Register ary1, Register len, Register resu return pc(); } +// Clobbers: rscratch1, rscratch2, rflags +// May also clobber v0-v7 when (!UseSimpleArrayEquals && UseSIMDForArrayEquals) address MacroAssembler::arrays_equals(Register a1, Register a2, Register tmp3, Register tmp4, Register tmp5, Register result, Register cnt1, int elem_size) { @@ -5615,10 +5618,13 @@ void MacroAssembler::fill_words(Register base, Register cnt, Register value) // Intrinsic for sun/nio/cs/ISO_8859_1$Encoder.implEncodeISOArray and // java/lang/StringUTF16.compress. +// +// Clobbers: src, dst, res, rscratch1, rscratch2, rflags void MacroAssembler::encode_iso_array(Register src, Register dst, - Register len, Register result, - FloatRegister Vtmp1, FloatRegister Vtmp2, - FloatRegister Vtmp3, FloatRegister Vtmp4) + Register len, Register result, + FloatRegister Vtmp1, FloatRegister Vtmp2, + FloatRegister Vtmp3, FloatRegister Vtmp4, + FloatRegister Vtmp5, FloatRegister Vtmp6) { Label DONE, SET_RESULT, NEXT_32, NEXT_32_PRFM, LOOP_8, NEXT_8, LOOP_1, NEXT_1, NEXT_32_START, NEXT_32_PRFM_START; @@ -5641,13 +5647,13 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); BIND(NEXT_32_PRFM_START); prfm(Address(src, SoftwarePrefetchHintDistance)); - orr(v4, T16B, Vtmp1, Vtmp2); - orr(v5, T16B, Vtmp3, Vtmp4); + orr(Vtmp5, T16B, Vtmp1, Vtmp2); + orr(Vtmp6, T16B, Vtmp3, Vtmp4); uzp1(Vtmp1, T16B, Vtmp1, Vtmp2); uzp1(Vtmp3, T16B, Vtmp3, Vtmp4); - uzp2(v5, T16B, v4, v5); // high bytes - umov(tmp2, v5, D, 1); - fmovd(tmp1, v5); + uzp2(Vtmp6, T16B, Vtmp5, Vtmp6); // high bytes + umov(tmp2, Vtmp6, D, 1); + fmovd(tmp1, Vtmp6); orr(tmp1, tmp1, tmp2); cbnz(tmp1, LOOP_8); stpq(Vtmp1, Vtmp3, dst); @@ -5666,8 +5672,8 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, ld1(Vtmp1, Vtmp2, Vtmp3, Vtmp4, T8H, src); } prfm(Address(src, SoftwarePrefetchHintDistance)); - uzp1(v4, T16B, Vtmp1, Vtmp2); - uzp1(v5, T16B, Vtmp3, Vtmp4); + uzp1(Vtmp5, T16B, Vtmp1, Vtmp2); + uzp1(Vtmp6, T16B, Vtmp3, Vtmp4); orr(Vtmp1, T16B, Vtmp1, Vtmp2); orr(Vtmp3, T16B, Vtmp3, Vtmp4); uzp2(Vtmp1, T16B, Vtmp1, Vtmp3); // high bytes @@ -5675,7 +5681,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, fmovd(tmp1, Vtmp1); orr(tmp1, tmp1, tmp2); cbnz(tmp1, LOOP_8); - stpq(v4, v5, dst); + stpq(Vtmp5, Vtmp6, dst); sub(len, len, 32); add(dst, dst, 32); add(src, src, 64); @@ -5720,6 +5726,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst, // Inflate byte[] array to char[]. +// Clobbers: src, dst, len, rflags, rscratch1, v0-v6 address MacroAssembler::byte_array_inflate(Register src, Register dst, Register len, FloatRegister vtmp1, FloatRegister vtmp2, FloatRegister vtmp3, Register tmp4) { @@ -5828,9 +5835,10 @@ address MacroAssembler::byte_array_inflate(Register src, Register dst, Register void MacroAssembler::char_array_compress(Register src, Register dst, Register len, FloatRegister tmp1Reg, FloatRegister tmp2Reg, FloatRegister tmp3Reg, FloatRegister tmp4Reg, + FloatRegister tmp5Reg, FloatRegister tmp6Reg, Register result) { encode_iso_array(src, dst, len, result, - tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg); + tmp1Reg, tmp2Reg, tmp3Reg, tmp4Reg, tmp5Reg, tmp6Reg); cmp(len, zr); csel(result, result, zr, EQ); } diff --git a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp index 7e23c16a442..01fdf16a01c 100644 --- a/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp +++ b/src/hotspot/cpu/aarch64/macroAssembler_aarch64.hpp @@ -1245,12 +1245,14 @@ class MacroAssembler: public Assembler { void char_array_compress(Register src, Register dst, Register len, FloatRegister tmp1Reg, FloatRegister tmp2Reg, FloatRegister tmp3Reg, FloatRegister tmp4Reg, + FloatRegister tmp5Reg, FloatRegister tmp6Reg, Register result); void encode_iso_array(Register src, Register dst, Register len, Register result, FloatRegister Vtmp1, FloatRegister Vtmp2, - FloatRegister Vtmp3, FloatRegister Vtmp4); + FloatRegister Vtmp3, FloatRegister Vtmp4, + FloatRegister Vtmp5, FloatRegister Vtmp6); void string_indexof(Register str1, Register str2, Register cnt1, Register cnt2, Register tmp1, Register tmp2, diff --git a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp index bd4b5d7c13f..482784d6b7b 100644 --- a/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp +++ b/src/hotspot/cpu/aarch64/stubGenerator_aarch64.cpp @@ -4099,6 +4099,7 @@ class StubGenerator: public StubCodeGenerator { // result = r0 - return value. Already contains "false" // cnt1 = r10 - amount of elements left to check, reduced by wordSize // r3-r5 are reserved temporary registers + // Clobbers: v0-v7 when UseSIMDForArrayEquals, rscratch1, rscratch2 address generate_large_array_equals() { Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, @@ -4503,6 +4504,8 @@ class StubGenerator: public StubCodeGenerator { // R2 = cnt1 // R3 = str1 // R4 = cnt2 + // Clobbers: rscratch1, rscratch2, v0, v1, rflags + // // This generic linear code use few additional ideas, which makes it faster: // 1) we can safely keep at least 1st register of pattern(since length >= 8) // in order to skip initial loading(help in systems with 1 ld pipeline) @@ -4817,6 +4820,7 @@ class StubGenerator: public StubCodeGenerator { // R3 = len >> 3 // V0 = 0 // v1 = loaded 8 bytes + // Clobbers: r0, r1, r3, rscratch1, rflags, v0-v6 address generate_large_byte_array_inflate() { __ align(CodeEntryAlignment); StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); diff --git a/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java b/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java new file mode 100644 index 00000000000..960661b975a --- /dev/null +++ b/test/hotspot/jtreg/compiler/c2/aarch64/TestIntrinsicsRegStress.java @@ -0,0 +1,296 @@ +/* + * Copyright (c) 2023, Arm Limited. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/* + * @test + * @bug 8307572 + * @summary Verify vector register clobbering in some aarch64 intrinsics + * @library /compiler/patches /test/lib + * @build java.base/java.lang.Helper + * @run main/othervm -Xbatch -XX:CompileThreshold=100 -XX:-TieredCompilation compiler.c2.aarch64.TestIntrinsicsRegStress + */ + +package compiler.c2.aarch64; + +import java.util.Arrays; + +public class TestIntrinsicsRegStress { + + final int LENGTH = 1024; + final int ITER = 10000; + final int NUM = 32; + + byte[] ba; + char[] ca; + char[] cb; + float[] fv; + + String str; + String[] strings; + String needle = "01234567890123456789"; + + public void init() { + ca = new char[LENGTH]; + fv = new float[NUM]; + strings = new String[NUM]; + for (int i = 0; i < LENGTH; i++) { + ca[i] = (char) ('a' + i % NUM); + } + cb = ca.clone(); + str = new String(ca); + for (int i = 0; i < NUM; i++) { + fv[i] = 1; + } + for (int i = 0; i < NUM; i++) { + strings[i] = str.substring(i) + needle; + } + } + + public void checkIndexOf(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + int result = strings[iter % NUM].indexOf(needle); + + if (result > LENGTH - NUM / 2) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. + t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testIndexOf() { + for (int i = 0; i < ITER; i++) { + checkIndexOf(i); + } + } + + public void checkArraysEquals() { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + if (Arrays.equals(ca, cb)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. + t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testArraysEquals() { + for (int i = 0; i < ITER; i++) { + checkArraysEquals(); + } + } + + public void checkCompress(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + ba = Helper.compressChar(ca, 0, LENGTH, 0, LENGTH); + + if (ba[iter % LENGTH] > (byte) ('a' + 5)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. + t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testCompress() { + for (int i = 0; i < ITER; i++) { + checkCompress(i); + } + } + + public void checkInflate(int iter) { + float t0 = 0; + float t1 = fv[1] * fv[0]; + float t2 = fv[2] * fv[0]; + float t3 = fv[3] * fv[0]; + float t4 = fv[4] * fv[0]; + float t5 = fv[5] * fv[0]; + float t6 = fv[6] * fv[0]; + float t7 = fv[7] * fv[0]; + float t8 = fv[8] * fv[0]; + float t9 = fv[9] * fv[0]; + float t10 = fv[10] * fv[0]; + float t11 = fv[11] * fv[0]; + float t12 = fv[12] * fv[0]; + float t13 = fv[13] * fv[0]; + float t14 = fv[14] * fv[0]; + float t15 = fv[15] * fv[0]; + float t16 = fv[16] * fv[0]; + float t17 = fv[17] * fv[0]; + float t18 = fv[18] * fv[0]; + float t19 = fv[19] * fv[0]; + float t20 = fv[20] * fv[0]; + float t21 = fv[21] * fv[0]; + float t22 = fv[22] * fv[0]; + float t23 = fv[23] * fv[0]; + float t24 = fv[24] * fv[0]; + float t25 = fv[25] * fv[0]; + float t26 = fv[26] * fv[0]; + float t27 = fv[27] * fv[0]; + float t28 = fv[28] * fv[0]; + float t29 = fv[29] * fv[0]; + float t30 = fv[30] * fv[0]; + + str.getChars(0, LENGTH, ca, 0); + + if (ca[iter % LENGTH] > (byte) ('a' + NUM / 2)) { + // Use fp registers as many as possible and try to make them + // live across above intrinsic function. + t0 += t1 - t2 + t3 - t4 + t5 - t6 + t7 - t8 + t9 - t10 + t11 - t12 + t13 - t14 + t15 + - t16 + t17 - t18 + t19 - t20 + t21 - t22 + t23 - t24 + t25 - t26 + t27 - t28 + + t29 - t30; // 0 + } + fv[31] += t0 + t2 - t11 + t16 - t29; + } + + public void testInflate() { + for (int i = 0; i < ITER; i++) { + checkInflate(i); + } + } + + public void verifyAndReset() { + if (fv[31] != 1.0) { + throw new RuntimeException("Failed with " + Float.toString(fv[31])); + } else { + System.out.println("Success!"); + } + fv[31] = 1.0f; + } + + public static void main(String[] args) { + TestIntrinsicsRegStress t = new TestIntrinsicsRegStress(); + t.init(); + + t.testIndexOf(); + t.verifyAndReset(); + + t.testArraysEquals(); + t.verifyAndReset(); + + t.testCompress(); + t.verifyAndReset(); + + t.testInflate(); + t.verifyAndReset(); + } +}