Skip to content

Commit

Permalink
8290034: Auto vectorize reverse bit operations.
Browse files Browse the repository at this point in the history
Reviewed-by: xgong, kvn
  • Loading branch information
Jatin Bhateja committed Jul 28, 2022
1 parent 348a052 commit 5d82d67
Show file tree
Hide file tree
Showing 18 changed files with 425 additions and 0 deletions.
8 changes: 8 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Expand Up @@ -10115,6 +10115,14 @@ void Assembler::evpternlogq(XMMRegister dst, int imm8, KRegister mask, XMMRegist
emit_int8(imm8);
}

// Emits the two-operand form of GF2P8AFFINEQB (Galois field affine
// transformation): dst is passed to the prefix encoder as both the destination
// and the first source, src is the second source, and imm8 is emitted as the
// trailing immediate byte. Asserts that the CPU supports GFNI and SSE.
void Assembler::gf2p8affineqb(XMMRegister dst, XMMRegister src, int imm8) {
  assert(VM_Version::supports_gfni(), "");
  assert(VM_Version::supports_sse(), "");

  // Encoding attributes, spelled out by name instead of inline comments.
  const bool attr_rex_w       = true;
  const bool attr_legacy_mode = false;
  const bool attr_no_mask_reg = true;
  const bool attr_uses_vl     = false;
  InstructionAttr attributes(AVX_128bit, attr_rex_w, attr_legacy_mode, attr_no_mask_reg, attr_uses_vl);

  const int reg_encoding = simd_prefix_and_encode(dst, dst, src, VEX_SIMD_66, VEX_OPCODE_0F_3A, &attributes);

  // Opcode byte, then the register-register operand byte, then the immediate.
  const unsigned char opcode_byte  = (unsigned char)0xCE;
  const unsigned char operand_byte = (unsigned char)(0xC0 | reg_encoding);
  emit_int24(opcode_byte, operand_byte, imm8);
}

void Assembler::vgf2p8affineqb(XMMRegister dst, XMMRegister src2, XMMRegister src3, int imm8, int vector_len) {
assert(VM_Version::supports_gfni(), "requires GFNI support");
assert(VM_Version::supports_sse(), "");
Expand Down
1 change: 1 addition & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
Expand Up @@ -2801,6 +2801,7 @@ class Assembler : public AbstractAssembler {
void evpblendmq(XMMRegister dst, KRegister mask, XMMRegister nds, XMMRegister src, bool merge, int vector_len);

// Galois field affine transformation instructions.
void gf2p8affineqb(XMMRegister dst, XMMRegister src, int imm8);
void vgf2p8affineqb(XMMRegister dst, XMMRegister src2, XMMRegister src3, int imm8, int vector_len);

protected:
Expand Down
84 changes: 84 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.cpp
Expand Up @@ -5484,6 +5484,90 @@ void C2_MacroAssembler::udivmodI(Register rax, Register divisor, Register rdx, R
}

#ifdef _LP64
// Emits code that reverses the order of all 32 bits of src and leaves the
// result in dst. This is the code generated for the ReverseI match rules in
// x86_64.ad.
//
// The work is split in two steps: first the bits inside each byte are
// reversed (GFNI path or shift/mask fallback), then the final bswapl reverses
// the order of the four bytes.
//
//   dst          - receives the result; written before the final bswapl.
//   src          - input value; only read.
//   xtmp1, xtmp2 - XMM scratch registers, used only on the GFNI path
//                  (callers pass xnoreg when GFNI is not supported).
//   rtmp         - general purpose scratch register; holds the 64-bit matrix
//                  constant on the GFNI path and the shifted half of each
//                  swap on the fallback path.
void C2_MacroAssembler::reverseI(Register dst, Register src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp) {
if(VM_Version::supports_gfni()) {
// Galois field instruction based bit reversal based on following algorithm.
// http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
// rtmp carries the 8x8 bit matrix; it is moved into xtmp2 and applied to the
// bytes of src (held in xtmp1) with an immediate of 0.
mov64(rtmp, 0x8040201008040201L);
movq(xtmp1, src);
movq(xtmp2, rtmp);
gf2p8affineqb(xtmp1, xtmp2, 0);
movq(dst, xtmp1);
} else {
// Swap even and odd numbered bits.
// rtmp = (src & 0x55555555) << 1, dst = (src & 0xAAAAAAAA) >> 1, then merge.
movl(rtmp, src);
andl(rtmp, 0x55555555);
shll(rtmp, 1);
movl(dst, src);
andl(dst, 0xAAAAAAAA);
shrl(dst, 1);
orl(dst, rtmp);

// Swap LSB and MSB 2 bits of each nibble.
movl(rtmp, dst);
andl(rtmp, 0x33333333);
shll(rtmp, 2);
andl(dst, 0xCCCCCCCC);
shrl(dst, 2);
orl(dst, rtmp);

// Swap LSB and MSB 4 bits of each byte.
movl(rtmp, dst);
andl(rtmp, 0x0F0F0F0F);
shll(rtmp, 4);
andl(dst, 0xF0F0F0F0);
shrl(dst, 4);
orl(dst, rtmp);
}
// Both paths leave the bytes in their original positions; reverse the byte
// order to complete the 32-bit reversal.
bswapl(dst);
}

// Emits code that reverses the order of all 64 bits of src and leaves the
// result in dst. This is the code generated for the ReverseL match rules in
// x86_64.ad.
//
// As in reverseI, the bits inside each byte are reversed first (GFNI path or
// shift/mask fallback) and the final bswapq reverses the order of the eight
// bytes.
//
//   dst          - receives the result; written before the final bswapq.
//   src          - input value; only read.
//   xtmp1, xtmp2 - XMM scratch registers, used only on the GFNI path
//                  (callers pass xnoreg when GFNI is not supported).
//   rtmp1        - general purpose scratch register, used on both paths.
//   rtmp2        - general purpose scratch register holding the 64-bit masks;
//                  used only on the fallback path (the GFNI rule passes noreg).
void C2_MacroAssembler::reverseL(Register dst, Register src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp1, Register rtmp2) {
if(VM_Version::supports_gfni()) {
// Galois field instruction based bit reversal based on following algorithm.
// http://0x80.pl/articles/avx512-galois-field-for-bit-shuffling.html
// rtmp1 carries the 8x8 bit matrix; it is moved into xtmp2 and applied to the
// bytes of src (held in xtmp1) with an immediate of 0.
mov64(rtmp1, 0x8040201008040201L);
movq(xtmp1, src);
movq(xtmp2, rtmp1);
gf2p8affineqb(xtmp1, xtmp2, 0);
movq(dst, xtmp1);
} else {
// Swap even and odd numbered bits.
// Each stage loads one mask into rtmp2, uses it for the half that is shifted
// left, then inverts it in place (notq) to obtain the complementary mask for
// the half that is shifted right.
movq(rtmp1, src);
mov64(rtmp2, 0x5555555555555555L);
andq(rtmp1, rtmp2);
shlq(rtmp1, 1);
movq(dst, src);
notq(rtmp2);
andq(dst, rtmp2);
shrq(dst, 1);
orq(dst, rtmp1);

// Swap LSB and MSB 2 bits of each nibble.
movq(rtmp1, dst);
mov64(rtmp2, 0x3333333333333333L);
andq(rtmp1, rtmp2);
shlq(rtmp1, 2);
notq(rtmp2);
andq(dst, rtmp2);
shrq(dst, 2);
orq(dst, rtmp1);

// Swap LSB and MSB 4 bits of each byte.
movq(rtmp1, dst);
mov64(rtmp2, 0x0F0F0F0F0F0F0F0FL);
andq(rtmp1, rtmp2);
shlq(rtmp1, 4);
notq(rtmp2);
andq(dst, rtmp2);
shrq(dst, 4);
orq(dst, rtmp1);
}
// Both paths leave the bytes in their original positions; reverse the byte
// order to complete the 64-bit reversal.
bswapq(dst);
}

void C2_MacroAssembler::udivL(Register rax, Register divisor, Register rdx) {
Label done;
Label neg_divisor_fastpath;
Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/cpu/x86/c2_MacroAssembler_x86.hpp
Expand Up @@ -368,6 +368,10 @@
void udivmodI(Register rax, Register divisor, Register rdx, Register tmp);

#ifdef _LP64
// Reverses the bit order of the 32-bit value in src into dst. xtmp1/xtmp2 are
// XMM scratch registers used only when GFNI is supported; rtmp is a general
// purpose scratch register.
void reverseI(Register dst, Register src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp);
// Reverses the bit order of the 64-bit value in src into dst. xtmp1/xtmp2 are
// XMM scratch registers used only when GFNI is supported; rtmp1 is a general
// purpose scratch register and rtmp2 is a second one used only without GFNI.
void reverseL(Register dst, Register src, XMMRegister xtmp1,
XMMRegister xtmp2, Register rtmp1, Register rtmp2);
void udivL(Register rax, Register divisor, Register rdx);
void umodL(Register rax, Register divisor, Register rdx);
void udivmodL(Register rax, Register divisor, Register rdx, Register tmp);
Expand Down
44 changes: 44 additions & 0 deletions src/hotspot/cpu/x86/x86_64.ad
Expand Up @@ -6721,6 +6721,50 @@ instruct countTrailingZerosL_bsf(rRegI dst, rRegL src, rFlagsReg cr) %{
ins_pipe(ialu_reg);
%}

//--------------- Reverse Operation Instructions ----------------
// ReverseI without GFNI: selected only when VM_Version::supports_gfni() is
// false. Expands to reverseI with no XMM temporaries (xnoreg is passed for
// both). dst and rtmp are declared TEMP and the flags register is killed.
instruct bytes_reversebit_int(rRegI dst, rRegI src, rRegI rtmp, rFlagsReg cr) %{
predicate(!VM_Version::supports_gfni());
match(Set dst (ReverseI src));
effect(TEMP dst, TEMP rtmp, KILL cr);
format %{ "reverse_int $dst $src\t! using $rtmp as TEMP" %}
ins_encode %{
__ reverseI($dst$$Register, $src$$Register, xnoreg, xnoreg, $rtmp$$Register);
%}
ins_pipe( ialu_reg );
%}

// ReverseI with GFNI: selected only when VM_Version::supports_gfni() is true.
// Expands to reverseI with two XMM temporaries and a 64-bit general purpose
// temporary (rtmp is rRegL because reverseI loads a 64-bit constant into it on
// the GFNI path). dst and all temporaries are TEMP; the flags register is
// killed.
instruct bytes_reversebit_int_gfni(rRegI dst, rRegI src, regF xtmp1, regF xtmp2, rRegL rtmp, rFlagsReg cr) %{
predicate(VM_Version::supports_gfni());
match(Set dst (ReverseI src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp, KILL cr);
format %{ "reverse_int $dst $src\t! using $rtmp, $xtmp1 and $xtmp2 as TEMP" %}
ins_encode %{
__ reverseI($dst$$Register, $src$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $rtmp$$Register);
%}
ins_pipe( ialu_reg );
%}

// ReverseL without GFNI: selected only when VM_Version::supports_gfni() is
// false. Expands to reverseL with two general purpose temporaries and no XMM
// temporaries (xnoreg is passed for both). dst, rtmp1 and rtmp2 are TEMP and
// the flags register is killed.
instruct bytes_reversebit_long(rRegL dst, rRegL src, rRegL rtmp1, rRegL rtmp2, rFlagsReg cr) %{
predicate(!VM_Version::supports_gfni());
match(Set dst (ReverseL src));
effect(TEMP dst, TEMP rtmp1, TEMP rtmp2, KILL cr);
format %{ "reverse_long $dst $src\t! using $rtmp1 and $rtmp2 as TEMP" %}
ins_encode %{
__ reverseL($dst$$Register, $src$$Register, xnoreg, xnoreg, $rtmp1$$Register, $rtmp2$$Register);
%}
ins_pipe( ialu_reg );
%}

// ReverseL with GFNI: selected only when VM_Version::supports_gfni() is true.
// Expands to reverseL with two XMM temporaries and one general purpose
// temporary; noreg is passed for the second general purpose temporary, which
// reverseL does not touch on the GFNI path. dst and all temporaries are TEMP;
// the flags register is killed.
instruct bytes_reversebit_long_gfni(rRegL dst, rRegL src, regD xtmp1, regD xtmp2, rRegL rtmp, rFlagsReg cr) %{
predicate(VM_Version::supports_gfni());
match(Set dst (ReverseL src));
effect(TEMP dst, TEMP xtmp1, TEMP xtmp2, TEMP rtmp, KILL cr);
format %{ "reverse_long $dst $src\t! using $rtmp, $xtmp1 and $xtmp2 as TEMP" %}
ins_encode %{
__ reverseL($dst$$Register, $src$$Register, $xtmp1$$XMMRegister, $xtmp2$$XMMRegister, $rtmp$$Register, noreg);
%}
ins_pipe( ialu_reg );
%}

//---------- Population Count Instructions -------------------------------------

Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/share/classfile/vmIntrinsics.hpp
Expand Up @@ -246,6 +246,9 @@ class methodHandle;
do_intrinsic(_expand_i, java_lang_Integer, expand_name, int2_int_signature, F_S) \
do_intrinsic(_expand_l, java_lang_Long, expand_name, long2_long_signature, F_S) \
\
do_intrinsic(_reverse_i, java_lang_Integer, reverse_name, int_int_signature, F_S) \
do_name( reverse_name, "reverse") \
do_intrinsic(_reverse_l, java_lang_Long, reverse_name, long_long_signature, F_S) \
do_intrinsic(_reverseBytes_i, java_lang_Integer, reverseBytes_name, int_int_signature, F_S) \
do_name( reverseBytes_name, "reverseBytes") \
do_intrinsic(_reverseBytes_l, java_lang_Long, reverseBytes_name, long_long_signature, F_S) \
Expand Down
6 changes: 6 additions & 0 deletions src/hotspot/share/opto/c2compiler.cpp
Expand Up @@ -263,6 +263,12 @@ bool C2Compiler::is_intrinsic_supported(const methodHandle& method, bool is_virt
case vmIntrinsics::_numberOfTrailingZeros_l:
if (!Matcher::match_rule_supported(Op_CountTrailingZerosL)) return false;
break;
case vmIntrinsics::_reverse_i:
if (!Matcher::match_rule_supported(Op_ReverseI)) return false;
break;
case vmIntrinsics::_reverse_l:
if (!Matcher::match_rule_supported(Op_ReverseL)) return false;
break;
case vmIntrinsics::_reverseBytes_c:
if (!Matcher::match_rule_supported(Op_ReverseBytesUS)) return false;
break;
Expand Down
4 changes: 4 additions & 0 deletions src/hotspot/share/opto/library_call.cpp
Expand Up @@ -525,6 +525,8 @@ bool LibraryCallKit::try_to_inline(int predicate) {
case vmIntrinsics::_numberOfTrailingZeros_l:
case vmIntrinsics::_bitCount_i:
case vmIntrinsics::_bitCount_l:
case vmIntrinsics::_reverse_i:
case vmIntrinsics::_reverse_l:
case vmIntrinsics::_reverseBytes_i:
case vmIntrinsics::_reverseBytes_l:
case vmIntrinsics::_reverseBytes_s:
Expand Down Expand Up @@ -2060,6 +2062,8 @@ bool LibraryCallKit::inline_number_methods(vmIntrinsics::ID id) {
case vmIntrinsics::_reverseBytes_s: n = new ReverseBytesSNode( 0, arg); break;
case vmIntrinsics::_reverseBytes_i: n = new ReverseBytesINode( 0, arg); break;
case vmIntrinsics::_reverseBytes_l: n = new ReverseBytesLNode( 0, arg); break;
case vmIntrinsics::_reverse_i: n = new ReverseINode(0, arg); break;
case vmIntrinsics::_reverse_l: n = new ReverseLNode(0, arg); break;
default: fatal_unexpected_iid(id); break;
}
set_result(_gvn.transform(n));
Expand Down
47 changes: 47 additions & 0 deletions src/hotspot/share/opto/subnode.cpp
Expand Up @@ -1899,3 +1899,50 @@ const Type* SqrtFNode::Value(PhaseGVN* phase) const {
if( f < 0.0f ) return Type::FLOAT;
return TypeF::make( (float)sqrt( (double)f ) );
}

// Reverses the order of the bits inside every byte of val (bit 0 <-> bit 7,
// bit 1 <-> bit 6, ...). The bytes themselves stay where they are, so this is
// only the per-byte part of a full 32-bit or 64-bit bit reversal.
static jlong reverse_bits(jlong val) {
  unsigned long long bits = (unsigned long long)val;
  // Exchange the high and low nibble of each byte.
  bits = ((bits >> 4) & 0x0F0F0F0F0F0F0F0FULL) | ((bits & 0x0F0F0F0F0F0F0F0FULL) << 4);
  // Exchange the two bit pairs of each nibble.
  bits = ((bits >> 2) & 0x3333333333333333ULL) | ((bits & 0x3333333333333333ULL) << 2);
  // Exchange the two bits of each pair.
  bits = ((bits >> 1) & 0x5555555555555555ULL) | ((bits & 0x5555555555555555ULL) << 1);
  return (jlong)bits;
}

// Computes the type of a 32-bit bit reversal.
//  - TOP input yields TOP.
//  - A constant int input is folded to the fully bit-reversed constant.
//  - Anything else yields the node's bottom type: the value range of the input
//    says nothing about the range of its bit-reversed result, so the input
//    type must not be propagated.
const Type* ReverseINode::Value(PhaseGVN* phase) const {
  const Type* t1 = phase->type(in(1));
  if (t1 == Type::TOP) {
    return Type::TOP;
  }
  const TypeInt* t1int = t1->isa_int();
  if (t1int && t1int->is_con()) {
    // reverse_bits() only reverses the bits inside each byte. Keep the low
    // 32 bits and reverse the byte order as well, matching the bswapl that
    // the generated code performs after its per-byte reversal.
    unsigned int bits = (unsigned int)reverse_bits(t1int->get_con());
    bits = (bits >> 24) |
           ((bits >> 8) & 0x0000FF00u) |
           ((bits << 8) & 0x00FF0000u) |
           (bits << 24);
    return TypeInt::make((jint)bits);
  }
  return bottom_type();
}

// Computes the type of a 64-bit bit reversal.
//  - TOP input yields TOP.
//  - A constant long input is folded to the fully bit-reversed constant.
//  - Anything else yields the node's bottom type: the value range of the input
//    says nothing about the range of its bit-reversed result, so the input
//    type must not be propagated.
const Type* ReverseLNode::Value(PhaseGVN* phase) const {
  const Type* t1 = phase->type(in(1));
  if (t1 == Type::TOP) {
    return Type::TOP;
  }
  const TypeLong* t1long = t1->isa_long();
  if (t1long && t1long->is_con()) {
    // reverse_bits() only reverses the bits inside each byte. Keep all 64 bits
    // (the result must not be narrowed to jint) and reverse the byte order as
    // well, matching the bswapq that the generated code performs after its
    // per-byte reversal.
    unsigned long long bits = (unsigned long long)reverse_bits(t1long->get_con());
    bits = ((bits & 0x00FF00FF00FF00FFULL) << 8)  | ((bits >> 8)  & 0x00FF00FF00FF00FFULL);
    bits = ((bits & 0x0000FFFF0000FFFFULL) << 16) | ((bits >> 16) & 0x0000FFFF0000FFFFULL);
    bits = (bits << 32) | (bits >> 32);
    return TypeLong::make((jlong)bits);
  }
  return bottom_type();
}

// Reversing the bits twice gives back the original value: when the operand is
// itself a ReverseI, this node is replaced by that node's operand. Otherwise
// the node is kept as is.
Node* ReverseINode::Identity(PhaseGVN* phase) {
  Node* operand = in(1);
  if (operand->Opcode() != Op_ReverseI) {
    return this;
  }
  return operand->in(1);
}

// Reversing the bits twice gives back the original value: when the operand is
// itself a ReverseL, this node is replaced by that node's operand. Otherwise
// the node is kept as is.
Node* ReverseLNode::Identity(PhaseGVN* phase) {
  Node* operand = in(1);
  if (operand->Opcode() != Op_ReverseL) {
    return this;
  }
  return operand->in(1);
}
4 changes: 4 additions & 0 deletions src/hotspot/share/opto/subnode.hpp
Expand Up @@ -580,6 +580,8 @@ class ReverseINode : public Node {
virtual int Opcode() const;
const Type *bottom_type() const { return TypeInt::INT; }
virtual uint ideal_reg() const { return Op_RegI; }
virtual Node* Identity(PhaseGVN* phase);
virtual const Type* Value(PhaseGVN* phase) const;
};

//-------------------------------ReverseLNode--------------------------------
Expand All @@ -590,6 +592,8 @@ class ReverseLNode : public Node {
virtual int Opcode() const;
const Type *bottom_type() const { return TypeLong::LONG; }
virtual uint ideal_reg() const { return Op_RegL; }
virtual Node* Identity(PhaseGVN* phase);
virtual const Type* Value(PhaseGVN* phase) const;
};

#endif // SHARE_OPTO_SUBNODE_HPP
1 change: 1 addition & 0 deletions src/hotspot/share/opto/superword.cpp
Expand Up @@ -2645,6 +2645,7 @@ bool SuperWord::output() {
opc == Op_PopCountI || opc == Op_PopCountL ||
opc == Op_ReverseBytesI || opc == Op_ReverseBytesL ||
opc == Op_ReverseBytesUS || opc == Op_ReverseBytesS ||
opc == Op_ReverseI || opc == Op_ReverseL ||
opc == Op_CountLeadingZerosI || opc == Op_CountLeadingZerosL ||
opc == Op_CountTrailingZerosI || opc == Op_CountTrailingZerosL) {
assert(n->req() == 2, "only one input expected");
Expand Down
1 change: 1 addition & 0 deletions src/java.base/share/classes/java/lang/Integer.java
Expand Up @@ -1762,6 +1762,7 @@ public static int rotateRight(int i, int distance) {
* specified {@code int} value.
* @since 1.5
*/
@IntrinsicCandidate
public static int reverse(int i) {
// HD, Figure 7-1
i = (i & 0x55555555) << 1 | (i >>> 1) & 0x55555555;
Expand Down
1 change: 1 addition & 0 deletions src/java.base/share/classes/java/lang/Long.java
Expand Up @@ -1901,6 +1901,7 @@ public static long rotateRight(long i, int distance) {
* specified {@code long} value.
* @since 1.5
*/
@IntrinsicCandidate
public static long reverse(long i) {
// HD, Figure 7-1
i = (i & 0x5555555555555555L) << 1 | (i >>> 1) & 0x5555555555555555L;
Expand Down
17 changes: 17 additions & 0 deletions test/hotspot/jtreg/compiler/c2/cr6340864/TestIntVect.java
Expand Up @@ -461,6 +461,10 @@ static int test() {
for (int i=0; i<ARRLEN; i++) {
errn += verify("test_reverse_bytes: ", i, a0[i], Integer.reverseBytes(a1[i]));
}
test_reverse(a0, a1);
for (int i=0; i<ARRLEN; i++) {
errn += verify("test_reverse: ", i, a0[i], Integer.reverse(a1[i]));
}

test_pack2(p2, a1);
for (int i=0; i<ARRLEN/2; i++) {
Expand Down Expand Up @@ -934,6 +938,13 @@ static int test() {
end = System.currentTimeMillis();
System.out.println("test_reverse_bytes: " + (end - start));

start = System.currentTimeMillis();
for (int i=0; i<ITERS; i++) {
test_reverse(a0, a1);
}
end = System.currentTimeMillis();
System.out.println("test_reverse: " + (end - start));

start = System.currentTimeMillis();
for (int i=0; i<ITERS; i++) {
test_pack2(p2, a1);
Expand Down Expand Up @@ -1287,6 +1298,12 @@ static void test_reverse_bytes(int [] a0, int [] a1) {
}
}

// Stores Integer.reverse(a1[i]) into a0[i] for every index of a0.
// a1 must be at least as long as a0.
// NOTE(review): plain counted loop, presumably kept in this form so that C2
// can auto-vectorize the Integer.reverse call - confirm before restructuring.
static void test_reverse(int [] a0, int [] a1) {
for(int i = 0; i < a0.length; i++) {
a0[i] = Integer.reverse(a1[i]);
}
}

static int verify(String text, int i, int elem, int val) {
if (elem != val) {
System.err.println(text + "[" + i + "] = " + elem + " != " + val);
Expand Down
17 changes: 17 additions & 0 deletions test/hotspot/jtreg/compiler/c2/cr6340864/TestLongVect.java
Expand Up @@ -436,6 +436,10 @@ static int test() {
for (int i=0; i<ARRLEN; i++) {
errn += verify("test_reverse_bytes: ", i, a0[i], Long.reverseBytes(a1[i]));
}
test_reverse(a0, a1);
for (int i=0; i<ARRLEN; i++) {
errn += verify("test_reverse: ", i, a0[i], Long.reverse(a1[i]));
}
}

if (errn > 0)
Expand Down Expand Up @@ -863,6 +867,12 @@ static int test() {
end = System.currentTimeMillis();
System.out.println("test_reverse_bytes: " + (end - start));

start = System.currentTimeMillis();
for (int i=0; i<ITERS; i++) {
test_reverse(a0, a1);
}
end = System.currentTimeMillis();
System.out.println("test_reverse: " + (end - start));
return errn;
}

Expand Down Expand Up @@ -1133,12 +1143,19 @@ static void test_srav_and(long[] a0, long[] a1, long b) {
a0[i] = (long)((a1[i] & b)>>VALUE);
}
}

// Stores Long.reverseBytes(a1[i]) into a0[i] for every index of a0.
// a1 must be at least as long as a0.
// NOTE(review): plain counted loop, presumably kept in this form so that C2
// can auto-vectorize the Long.reverseBytes call - confirm before restructuring.
static void test_reverse_bytes(long[] a0, long[] a1) {
for(int i = 0; i < a0.length; i++) {
a0[i] = Long.reverseBytes(a1[i]);
}
}

// Stores Long.reverse(a1[i]) into a0[i] for every index of a0.
// a1 must be at least as long as a0.
// NOTE(review): plain counted loop, presumably kept in this form so that C2
// can auto-vectorize the Long.reverse call - confirm before restructuring.
static void test_reverse(long[] a0, long[] a1) {
for(int i = 0; i < a0.length; i++) {
a0[i] = Long.reverse(a1[i]);
}
}

static int verify(String text, int i, long elem, long val) {
if (elem != val) {
System.err.println(text + "[" + i + "] = " + elem + " != " + val);
Expand Down

1 comment on commit 5d82d67

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.