@@ -8628,23 +8628,27 @@ void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Registe
8628
8628
#undef BLOCK_COMMENT
8629
8629
8630
8630
// Compress char[] array to byte[].
8631
- // ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
8631
+ // Intrinsic for java.lang.StringUTF16.compress(char[] src, int srcOff, byte[] dst, int dstOff, int len)
8632
+ // Returns the array length if every element in the array can be encoded;
8633
+ // otherwise, returns the index of the first non-latin1 (> 0xff) character.
8632
8634
// @IntrinsicCandidate
8633
- // private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8635
+ // public static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
8634
8636
// for (int i = 0; i < len; i++) {
8635
- // int c = src[srcOff++ ];
8636
- // if (c >>> 8 != 0 ) {
8637
- // return 0;
8637
+ // char c = src[srcOff];
8638
+ // if (c > 0xff ) {
8639
+ // return i; // return index of non-latin1 char
8638
8640
// }
8639
- // dst[dstOff++] = (byte)c;
8641
+ // dst[dstOff] = (byte)c;
8642
+ // srcOff++;
8643
+ // dstOff++;
8640
8644
// }
8641
8645
// return len;
8642
8646
// }
8643
8647
void MacroAssembler::char_array_compress (Register src, Register dst, Register len,
8644
8648
XMMRegister tmp1Reg, XMMRegister tmp2Reg,
8645
8649
XMMRegister tmp3Reg, XMMRegister tmp4Reg,
8646
8650
Register tmp5, Register result, KRegister mask1, KRegister mask2) {
8647
- Label copy_chars_loop, return_length, return_zero, done ;
8651
+ Label copy_chars_loop, done, reset_sp, copy_tail ;
8648
8652
8649
8653
// rsi: src
8650
8654
// rdi: dst
@@ -8659,28 +8663,28 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
8659
8663
assert (len != result, " " );
8660
8664
8661
8665
// save length for return
8662
- push ( len);
8666
+ movl (result, len);
8663
8667
8664
8668
if ((AVX3Threshold == 0 ) && (UseAVX > 2 ) && // AVX512
8665
8669
VM_Version::supports_avx512vlbw () &&
8666
8670
VM_Version::supports_bmi2 ()) {
8667
8671
8668
- Label copy_32_loop, copy_loop_tail, below_threshold;
8672
+ Label copy_32_loop, copy_loop_tail, below_threshold, reset_for_copy_tail ;
8669
8673
8670
8674
// alignment
8671
8675
Label post_alignment;
8672
8676
8673
- // if length of the string is less than 16 , handle it in an old fashioned way
8677
+ // if length of the string is less than 32 , handle it the old fashioned way
8674
8678
testl (len, -32 );
8675
8679
jcc (Assembler::zero, below_threshold);
8676
8680
8677
8681
// First check whether a character is compressible ( <= 0xFF).
8678
8682
// Create mask to test for Unicode chars inside zmm vector
8679
- movl (result , 0x00FF );
8680
- evpbroadcastw (tmp2Reg, result , Assembler::AVX_512bit);
8683
+ movl (tmp5 , 0x00FF );
8684
+ evpbroadcastw (tmp2Reg, tmp5 , Assembler::AVX_512bit);
8681
8685
8682
8686
testl (len, -64 );
8683
- jcc (Assembler::zero, post_alignment);
8687
+ jccb (Assembler::zero, post_alignment);
8684
8688
8685
8689
movl (tmp5, dst);
8686
8690
andl (tmp5, (32 - 1 ));
@@ -8689,18 +8693,19 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
8689
8693
8690
8694
// bail out when there is nothing to be done
8691
8695
testl (tmp5, 0xFFFFFFFF );
8692
- jcc (Assembler::zero, post_alignment);
8696
+ jccb (Assembler::zero, post_alignment);
8693
8697
8694
8698
// ~(~0 << len), where len is the # of remaining elements to process
8695
- movl (result, 0xFFFFFFFF );
8696
- shlxl (result, result, tmp5);
8697
- notl (result);
8698
- kmovdl (mask2, result);
8699
+ movl (len, 0xFFFFFFFF );
8700
+ shlxl (len, len, tmp5);
8701
+ notl (len);
8702
+ kmovdl (mask2, len);
8703
+ movl (len, result);
8699
8704
8700
8705
evmovdquw (tmp1Reg, mask2, Address (src, 0 ), /* merge*/ false , Assembler::AVX_512bit);
8701
8706
evpcmpw (mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /* signed*/ false , Assembler::AVX_512bit);
8702
8707
ktestd (mask1, mask2);
8703
- jcc (Assembler::carryClear, return_zero );
8708
+ jcc (Assembler::carryClear, copy_tail );
8704
8709
8705
8710
evpmovwb (Address (dst, 0 ), mask2, tmp1Reg, Assembler::AVX_512bit);
8706
8711
@@ -8715,7 +8720,7 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
8715
8720
movl (tmp5, len);
8716
8721
andl (tmp5, (32 - 1 )); // tail count (in chars)
8717
8722
andl (len, ~(32 - 1 )); // vector count (in chars)
8718
- jcc (Assembler::zero, copy_loop_tail);
8723
+ jccb (Assembler::zero, copy_loop_tail);
8719
8724
8720
8725
lea (src, Address (src, len, Address::times_2));
8721
8726
lea (dst, Address (dst, len, Address::times_1));
@@ -8725,55 +8730,60 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
8725
8730
evmovdquw (tmp1Reg, Address (src, len, Address::times_2), Assembler::AVX_512bit);
8726
8731
evpcmpuw (mask1, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
8727
8732
kortestdl (mask1, mask1);
8728
- jcc (Assembler::carryClear, return_zero );
8733
+ jccb (Assembler::carryClear, reset_for_copy_tail );
8729
8734
8730
8735
// All elements in current processed chunk are valid candidates for
8731
8736
// compression. Write a truncated byte elements to the memory.
8732
8737
evpmovwb (Address (dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
8733
8738
addptr (len, 32 );
8734
- jcc (Assembler::notZero, copy_32_loop);
8739
+ jccb (Assembler::notZero, copy_32_loop);
8735
8740
8736
8741
bind (copy_loop_tail);
8737
8742
// bail out when there is nothing to be done
8738
8743
testl (tmp5, 0xFFFFFFFF );
8739
- jcc (Assembler::zero, return_length );
8744
+ jcc (Assembler::zero, done );
8740
8745
8741
8746
movl (len, tmp5);
8742
8747
8743
8748
// ~(~0 << len), where len is the # of remaining elements to process
8744
- movl (result , 0xFFFFFFFF );
8745
- shlxl (result, result , len);
8746
- notl (result );
8749
+ movl (tmp5 , 0xFFFFFFFF );
8750
+ shlxl (tmp5, tmp5 , len);
8751
+ notl (tmp5 );
8747
8752
8748
- kmovdl (mask2, result );
8753
+ kmovdl (mask2, tmp5 );
8749
8754
8750
8755
evmovdquw (tmp1Reg, mask2, Address (src, 0 ), /* merge*/ false , Assembler::AVX_512bit);
8751
8756
evpcmpw (mask1, mask2, tmp1Reg, tmp2Reg, Assembler::le, /* signed*/ false , Assembler::AVX_512bit);
8752
8757
ktestd (mask1, mask2);
8753
- jcc (Assembler::carryClear, return_zero );
8758
+ jcc (Assembler::carryClear, copy_tail );
8754
8759
8755
8760
evpmovwb (Address (dst, 0 ), mask2, tmp1Reg, Assembler::AVX_512bit);
8756
- jmp (return_length);
8761
+ jmp (done);
8762
+
8763
+ bind (reset_for_copy_tail);
8764
+ lea (src, Address (src, tmp5, Address::times_2));
8765
+ lea (dst, Address (dst, tmp5, Address::times_1));
8766
+ subptr (len, tmp5);
8767
+ jmp (copy_chars_loop);
8757
8768
8758
8769
bind (below_threshold);
8759
8770
}
8760
8771
8761
8772
if (UseSSE42Intrinsics) {
8762
- Label copy_32_loop, copy_16, copy_tail ;
8773
+ Label copy_32_loop, copy_16, copy_tail_sse, reset_for_copy_tail ;
8763
8774
8764
- movl (result, len);
8775
+ // vectored compression
8776
+ testl (len, 0xfffffff8 );
8777
+ jcc (Assembler::zero, copy_tail);
8765
8778
8766
8779
movl (tmp5, 0xff00ff00 ); // create mask to test for Unicode chars in vectors
8780
+ movdl (tmp1Reg, tmp5);
8781
+ pshufd (tmp1Reg, tmp1Reg, 0 ); // store Unicode mask in tmp1Reg
8767
8782
8768
- // vectored compression
8769
- andl (len, 0xfffffff0 ); // vector count (in chars)
8770
- andl (result, 0x0000000f ); // tail count (in chars)
8771
- testl (len, len);
8772
- jcc (Assembler::zero, copy_16);
8783
+ andl (len, 0xfffffff0 );
8784
+ jccb (Assembler::zero, copy_16);
8773
8785
8774
8786
// compress 16 chars per iter
8775
- movdl (tmp1Reg, tmp5);
8776
- pshufd (tmp1Reg, tmp1Reg, 0 ); // store Unicode mask in tmp1Reg
8777
8787
pxor (tmp4Reg, tmp4Reg);
8778
8788
8779
8789
lea (src, Address (src, len, Address::times_2));
@@ -8786,59 +8796,60 @@ void MacroAssembler::char_array_compress(Register src, Register dst, Register le
8786
8796
movdqu (tmp3Reg, Address (src, len, Address::times_2, 16 )); // load next 8 characters
8787
8797
por (tmp4Reg, tmp3Reg);
8788
8798
ptest (tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
8789
- jcc (Assembler::notZero, return_zero );
8799
+ jccb (Assembler::notZero, reset_for_copy_tail );
8790
8800
packuswb (tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte
8791
8801
movdqu (Address (dst, len, Address::times_1), tmp2Reg);
8792
8802
addptr (len, 16 );
8793
- jcc (Assembler::notZero, copy_32_loop);
8803
+ jccb (Assembler::notZero, copy_32_loop);
8794
8804
8795
8805
// compress next vector of 8 chars (if any)
8796
8806
bind (copy_16);
8797
- movl (len, result);
8798
- andl (len, 0xfffffff8 ); // vector count (in chars)
8799
- andl (result, 0x00000007 ); // tail count (in chars)
8800
- testl (len, len);
8801
- jccb (Assembler::zero, copy_tail);
8807
+ // len = 0
8808
+ testl (result, 0x00000008 ); // check if there's a block of 8 chars to compress
8809
+ jccb (Assembler::zero, copy_tail_sse);
8802
8810
8803
- movdl (tmp1Reg, tmp5);
8804
- pshufd (tmp1Reg, tmp1Reg, 0 ); // store Unicode mask in tmp1Reg
8805
8811
pxor (tmp3Reg, tmp3Reg);
8806
8812
8807
8813
movdqu (tmp2Reg, Address (src, 0 ));
8808
8814
ptest (tmp2Reg, tmp1Reg); // check for Unicode chars in vector
8809
- jccb (Assembler::notZero, return_zero );
8815
+ jccb (Assembler::notZero, reset_for_copy_tail );
8810
8816
packuswb (tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
8811
8817
movq (Address (dst, 0 ), tmp2Reg);
8812
8818
addptr (src, 16 );
8813
8819
addptr (dst, 8 );
8820
+ jmpb (copy_tail_sse);
8814
8821
8815
- bind (copy_tail);
8822
+ bind (reset_for_copy_tail);
8823
+ movl (tmp5, result);
8824
+ andl (tmp5, 0x0000000f );
8825
+ lea (src, Address (src, tmp5, Address::times_2));
8826
+ lea (dst, Address (dst, tmp5, Address::times_1));
8827
+ subptr (len, tmp5);
8828
+ jmpb (copy_chars_loop);
8829
+
8830
+ bind (copy_tail_sse);
8816
8831
movl (len, result);
8832
+ andl (len, 0x00000007 ); // tail count (in chars)
8817
8833
}
8818
8834
// compress 1 char per iter
8835
+ bind (copy_tail);
8819
8836
testl (len, len);
8820
- jccb (Assembler::zero, return_length );
8837
+ jccb (Assembler::zero, done );
8821
8838
lea (src, Address (src, len, Address::times_2));
8822
8839
lea (dst, Address (dst, len, Address::times_1));
8823
8840
negptr (len);
8824
8841
8825
8842
bind (copy_chars_loop);
8826
- load_unsigned_short (result , Address (src, len, Address::times_2));
8827
- testl (result , 0xff00 ); // check if Unicode char
8828
- jccb (Assembler::notZero, return_zero );
8829
- movb (Address (dst, len, Address::times_1), result ); // ASCII char; compress to 1 byte
8843
+ load_unsigned_short (tmp5 , Address (src, len, Address::times_2));
8844
+ testl (tmp5 , 0xff00 ); // check if Unicode char
8845
+ jccb (Assembler::notZero, reset_sp );
8846
+ movb (Address (dst, len, Address::times_1), tmp5 ); // ASCII char; compress to 1 byte
8830
8847
increment (len);
8831
- jcc (Assembler::notZero, copy_chars_loop);
8848
+ jccb (Assembler::notZero, copy_chars_loop);
8832
8849
8833
- // if compression succeeded, return length
8834
- bind (return_length);
8835
- pop (result);
8836
- jmpb (done);
8837
-
8838
- // if compression failed, return 0
8839
- bind (return_zero);
8840
- xorl (result, result);
8841
- addptr (rsp, wordSize);
8850
+ // add len then return (len will be zero if compress succeeded, otherwise negative)
8851
+ bind (reset_sp);
8852
+ addl (result, len);
8842
8853
8843
8854
bind (done);
8844
8855
}
0 commit comments