Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8271883: Math CopySign optimization for x86 #5005

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
15 changes: 15 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.cpp
Expand Up @@ -2729,6 +2729,21 @@ void Assembler::movdqu(Address dst, XMMRegister src) {
emit_operand(src, dst);
}

// Emits the VEX-encoded, register-direct form of opcode 0x6E (0x66 SIMD
// prefix, 0F opcode map) with vex_w cleared, taking general-purpose register
// `src` as the source and XMM register `dst` as the destination.
// Requires UseAVX > 0.
void Assembler::vmovd(XMMRegister dst, Register src) {
  assert(UseAVX > 0, "");

  const bool vex_w       = false;  // vmovq uses the same encoding with this set
  const bool legacy_mode = false;
  const bool no_mask_reg = true;
  const bool uses_vl     = true;
  InstructionAttr attrs(AVX_128bit, vex_w, legacy_mode, no_mask_reg, uses_vl);

  const int dst_enc  = dst->encoding();
  const int src_enc  = src->encoding();
  const int reg_bits = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attrs);

  // Opcode byte, then a ModRM byte whose top two bits (0xC0) select the
  // register-direct addressing form.
  emit_int16(0x6E, (0xC0 | reg_bits));
}

// Emits the VEX-encoded, register-direct form of opcode 0x6E (0x66 SIMD
// prefix, 0F opcode map) with vex_w set, taking general-purpose register
// `src` as the source and XMM register `dst` as the destination.
// Requires UseAVX > 0.
void Assembler::vmovq(XMMRegister dst, Register src) {
  assert(UseAVX > 0, "");

  const bool vex_w       = true;   // the only difference from vmovd
  const bool legacy_mode = false;
  const bool no_mask_reg = true;
  const bool uses_vl     = true;
  InstructionAttr attrs(AVX_128bit, vex_w, legacy_mode, no_mask_reg, uses_vl);

  const int dst_enc  = dst->encoding();
  const int src_enc  = src->encoding();
  const int reg_bits = vex_prefix_and_encode(dst_enc, 0, src_enc, VEX_SIMD_66, VEX_OPCODE_0F, &attrs);

  // Opcode byte, then a ModRM byte whose top two bits (0xC0) select the
  // register-direct addressing form.
  emit_int16(0x6E, (0xC0 | reg_bits));
}


// Move Unaligned 256bit Vector
void Assembler::vmovdqu(XMMRegister dst, XMMRegister src) {
assert(UseAVX > 0, "");
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/x86/assembler_x86.hpp
Expand Up @@ -1508,6 +1508,9 @@ class Assembler : public AbstractAssembler {
void movdqu(XMMRegister dst, Address src);
void movdqu(XMMRegister dst, XMMRegister src);

// Move a general-purpose register into an XMM register using a VEX encoding
// (the definitions assert UseAVX > 0). vmovd is emitted with vex_w cleared,
// vmovq with vex_w set.
void vmovd(XMMRegister dst, Register src);
void vmovq(XMMRegister dst, Register src);

// Move Unaligned 256bit Vector
void vmovdqu(Address dst, XMMRegister src);
void vmovdqu(XMMRegister dst, Address src);
Expand Down
3 changes: 3 additions & 0 deletions src/hotspot/cpu/x86/vm_version_x86.cpp
Expand Up @@ -1736,6 +1736,9 @@ void VM_Version::get_processor_features() {
if (FLAG_IS_DEFAULT(UseSignumIntrinsic)) {
FLAG_SET_DEFAULT(UseSignumIntrinsic, true);
}
if (FLAG_IS_DEFAULT(UseCopySignIntrinsic)) {
FLAG_SET_DEFAULT(UseCopySignIntrinsic, true);
}
}

void VM_Version::print_platform_virtualization_info(outputStream* st) {
Expand Down
70 changes: 69 additions & 1 deletion src/hotspot/cpu/x86/x86.ad
Expand Up @@ -1560,6 +1560,15 @@ const bool Matcher::match_rule_supported(int opcode) {
return false;
}
break;
case Op_CopySignD:
case Op_CopySignF:
if (UseAVX < 3 || !is_LP64) {
return false;
}
if (!VM_Version::supports_avx512vl()) {
return false;
}
break;
#ifndef _LP64
case Op_AddReductionVF:
case Op_AddReductionVD:
Expand Down Expand Up @@ -5773,7 +5782,7 @@ instruct evminmaxFP_reg_eavx(vec dst, vec a, vec b, vec atmp, vec btmp, kReg ktm
ins_pipe( pipe_slow );
%}

// --------------------------------- Signum ---------------------------
// --------------------------------- Signum/CopySign ---------------------------

instruct signumF_reg(regF dst, regF zero, regF one, rRegP scratch, rFlagsReg cr) %{
match(Set dst (SignumF dst (Binary zero one)));
Expand All @@ -5797,6 +5806,65 @@ instruct signumD_reg(regD dst, regD zero, regD one, rRegP scratch, rFlagsReg cr)
ins_pipe( pipe_slow );
%}

// ---------------------------------------
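// For copySign use 0xE4 as the imm8 truth table for vpternlog
// Desired Truth Table: A -> xmm0 bit, B -> xmm1 bit, C -> xmm2 bit
// A (xmm0) is dst and holds the magnitude, B (xmm1) is src and holds the sign,
// C (xmm2) is a mask with every bit set except the sign bit (0x7FFFFFFF / 0x7FFFFFFFFFFFFFFF)
// Wherever xmm2 is 0, we want to pick from B (sign)
// Wherever xmm2 is 1, we want to pick from A (magnitude)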
mgkwill marked this conversation as resolved.
Show resolved Hide resolved
//
// A B C Result
// 0 0 0 0
// 0 0 1 0
// 0 1 0 1
// 0 1 1 0
// 1 0 0 0
// 1 0 1 1
// 1 1 0 1
// 1 1 1 1
//
// Result column read from the last row (A=1,B=1,C=1) up to the first is 0b11100100 = 0xE4
// ---------------------------------------

#ifdef _LP64
// Float copySign: dst holds the magnitude on entry and receives the result;
// src supplies the sign bit. tmp1 (XMM) and tmp2 (GPR) are scratch registers
// used to materialize the 0x7FFFFFFF mask.
instruct copySignF_reg(regF dst, regF src, regF tmp1, rRegI tmp2) %{
match(Set dst (CopySignF dst src));
effect(TEMP tmp1, TEMP tmp2);
format %{ "CopySignF $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
ins_encode %{
// tmp1 = 0x7FFFFFFF: ones in every bit position except the sign bit.
__ movl($tmp2$$Register, 0x7FFFFFFF);
__ vmovd($tmp1$$XMMRegister, $tmp2$$Register);
mgkwill marked this conversation as resolved.
Show resolved Hide resolved
// imm8 0xE4: keep dst's bit where the mask bit is 1, take src's bit where it is 0.
__ vpternlogd($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
%}
ins_pipe( pipe_slow );
%}

// Double copySign with the `zero` input in a register: dst holds the magnitude
// on entry and receives the result; src supplies the sign bit. tmp1 (XMM) and
// tmp2 (GPR) are scratch registers used to materialize the mask.
// The `zero` operand is part of the matched pattern but is not referenced by
// the encoding below.
// NOTE(review): the match rule is identical to copySignD_imm apart from the
// operand class of `zero` (regD here, immD there), and this form carries the
// higher ins_cost (125 vs 100) — confirm this variant is actually needed.
instruct copySignD_reg(regD dst, regD src, regD tmp1, rRegL tmp2, regD zero) %{
mgkwill marked this conversation as resolved.
Show resolved Hide resolved
match(Set dst (CopySignD dst (Binary src zero)));
ins_cost(125);
effect(TEMP tmp1, TEMP tmp2);
format %{ "CopySignD $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
ins_encode %{
// tmp1 = 0x7FFFFFFFFFFFFFFF: ones in every bit position except the sign bit.
__ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
__ vmovq($tmp1$$XMMRegister, $tmp2$$Register);
mgkwill marked this conversation as resolved.
Show resolved Hide resolved
// imm8 0xE4: keep dst's bit where the mask bit is 1, take src's bit where it is 0.
__ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
%}
ins_pipe( pipe_slow );
%}

// Double copySign with the `zero` input matched as an immediate: dst holds the
// magnitude on entry and receives the result; src supplies the sign bit.
// tmp1 (XMM) and tmp2 (GPR) are scratch registers used to materialize the
// mask. The `zero` operand is part of the matched pattern but is not
// referenced by the encoding below.
instruct copySignD_imm(regD dst, regD src, regD tmp1, rRegL tmp2, immD zero) %{
match(Set dst (CopySignD dst (Binary src zero)));
ins_cost(100);
effect(TEMP tmp1, TEMP tmp2);
format %{ "CopySignD $dst, $src\t! using $tmp1 and $tmp2 as TEMP" %}
ins_encode %{
// tmp1 = 0x7FFFFFFFFFFFFFFF: ones in every bit position except the sign bit.
__ mov64($tmp2$$Register, 0x7FFFFFFFFFFFFFFF);
__ vmovq($tmp1$$XMMRegister, $tmp2$$Register);
// imm8 0xE4: keep dst's bit where the mask bit is 1, take src's bit where it is 0.
__ vpternlogq($dst$$XMMRegister, 0xE4, $src$$XMMRegister, $tmp1$$XMMRegister, Assembler::AVX_128bit);
mgkwill marked this conversation as resolved.
Show resolved Hide resolved
%}
ins_pipe( pipe_slow );
%}
#endif // _LP64

// --------------------------------- Sqrt --------------------------------------

instruct vsqrtF_reg(vec dst, vec src) %{
Expand Down
51 changes: 51 additions & 0 deletions test/micro/org/openjdk/bench/vm/compiler/Signum.java
Expand Up @@ -100,6 +100,16 @@ private static float Signum_Kernel(float data)
return Math.signum(data);
}

// Benchmark kernel for the double overload: forwards to Math.copySign and
// returns `data` carrying the sign of `sign`.
private static double Copysign_Kernel(double data, double sign) {
    final double signedValue = Math.copySign(data, sign);
    return signedValue;
}

// Benchmark kernel for the float overload: forwards to Math.copySign and
// returns `data` carrying the sign of `sign`.
private static float Copysign_Kernel(float data, float sign) {
    final float signedValue = Math.copySign(data, sign);
    return signedValue;
}

@Benchmark
@OperationsPerInvocation(ITERATIONS * 17)
public void _1_signumFloatTest(Blackhole bh) {
Expand Down Expand Up @@ -139,4 +149,45 @@ public void _4_overheadDouble(Blackhole bh) {
}
}
}

// Measures the float copySign kernel: floatValue is passed as the magnitude
// argument and each element of float_values as the sign argument; every result
// is handed to the Blackhole.
// NOTE(review): the "* 17" presumably matches float_values.length — confirm.
@Benchmark
@OperationsPerInvocation(ITERATIONS * 17)
public void _5_copySignFloatTest(Blackhole bh) {
for (int i = 0; i < ITERATIONS; i++) {
for (float f : float_values) {
bh.consume(Copysign_Kernel(floatValue, f));
}
}
}

// Baseline for _5_copySignFloatTest: runs the same two loops over float_values
// but consumes each element directly, without calling Copysign_Kernel.
// NOTE(review): the "* 17" presumably matches float_values.length — confirm.
@Benchmark
@OperationsPerInvocation(ITERATIONS * 17)
public void _6_overheadCopySignFloat(Blackhole bh) {
for (int i = 0; i < ITERATIONS; i++) {
for (float f : float_values) {
bh.consume(f);
}
}
}

// Measures the double copySign kernel: doubleValue is passed as the magnitude
// argument and each element of double_values as the sign argument; every
// result is handed to the Blackhole.
// NOTE(review): the "* 17" presumably matches double_values.length — confirm.
@Benchmark
@OperationsPerInvocation(ITERATIONS * 17)
public void _7_copySignDoubleTest(Blackhole bh) {
for (int i = 0; i < ITERATIONS; i++) {
for (double d : double_values) {
bh.consume(Copysign_Kernel(doubleValue, d));
}
}
}

// Baseline for _7_copySignDoubleTest: runs the same two loops over
// double_values but consumes each element directly, without calling
// Copysign_Kernel.
// NOTE(review): the "* 17" presumably matches double_values.length — confirm.
@Benchmark
@OperationsPerInvocation(ITERATIONS * 17)
public void _8_overheadCopySignDouble(Blackhole bh) {
for (int i = 0; i < ITERATIONS; i++) {
for (double d : double_values) {
bh.consume(d);
}
}
}

}