Skip to content

Commit

Permalink
8318562: Computational test more than 2x slower when AVX instructions…
Browse files Browse the repository at this point in the history
… are used

Backport-of: 0881f2b0c43870ed10b1166d04cef9832e58629e
  • Loading branch information
Sandhya Viswanathan authored and shipilev committed Nov 21, 2023
1 parent e7bd0ab commit 0941f2c
Show file tree
Hide file tree
Showing 4 changed files with 247 additions and 2 deletions.
86 changes: 86 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1878,6 +1878,92 @@ void MacroAssembler::cmpoop(Register src1, jobject src2, Register rscratch) {
}
#endif

void MacroAssembler::cvtss2sd(XMMRegister dst, XMMRegister src) {
if ((UseAVX > 0) && (dst != src)) {
xorpd(dst, dst);
}
Assembler::cvtss2sd(dst, src);
}

void MacroAssembler::cvtss2sd(XMMRegister dst, Address src) {
if (UseAVX > 0) {
xorpd(dst, dst);
}
Assembler::cvtss2sd(dst, src);
}

void MacroAssembler::cvtsd2ss(XMMRegister dst, XMMRegister src) {
if ((UseAVX > 0) && (dst != src)) {
xorps(dst, dst);
}
Assembler::cvtsd2ss(dst, src);
}

void MacroAssembler::cvtsd2ss(XMMRegister dst, Address src) {
if (UseAVX > 0) {
xorps(dst, dst);
}
Assembler::cvtsd2ss(dst, src);
}

void MacroAssembler::cvtsi2sdl(XMMRegister dst, Register src) {
if (UseAVX > 0) {
xorpd(dst, dst);
}
Assembler::cvtsi2sdl(dst, src);
}

void MacroAssembler::cvtsi2sdl(XMMRegister dst, Address src) {
if (UseAVX > 0) {
xorpd(dst, dst);
}
Assembler::cvtsi2sdl(dst, src);
}

void MacroAssembler::cvtsi2ssl(XMMRegister dst, Register src) {
if (UseAVX > 0) {
xorps(dst, dst);
}
Assembler::cvtsi2ssl(dst, src);
}

void MacroAssembler::cvtsi2ssl(XMMRegister dst, Address src) {
if (UseAVX > 0) {
xorps(dst, dst);
}
Assembler::cvtsi2ssl(dst, src);
}

#ifdef _LP64
void MacroAssembler::cvtsi2sdq(XMMRegister dst, Register src) {
if (UseAVX > 0) {
xorpd(dst, dst);
}
Assembler::cvtsi2sdq(dst, src);
}

void MacroAssembler::cvtsi2sdq(XMMRegister dst, Address src) {
if (UseAVX > 0) {
xorpd(dst, dst);
}
Assembler::cvtsi2sdq(dst, src);
}

void MacroAssembler::cvtsi2ssq(XMMRegister dst, Register src) {
if (UseAVX > 0) {
xorps(dst, dst);
}
Assembler::cvtsi2ssq(dst, src);
}

void MacroAssembler::cvtsi2ssq(XMMRegister dst, Address src) {
if (UseAVX > 0) {
xorps(dst, dst);
}
Assembler::cvtsi2ssq(dst, src);
}
#endif // _LP64

void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch) {
assert(rscratch != noreg || always_reachable(adr), "missing");

Expand Down
17 changes: 17 additions & 0 deletions src/hotspot/cpu/x86/macroAssembler_x86.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -799,6 +799,23 @@ class MacroAssembler: public Assembler {

void cmpxchgptr(Register reg, Address adr);


// cvt instructions
void cvtss2sd(XMMRegister dst, XMMRegister src);
void cvtss2sd(XMMRegister dst, Address src);
void cvtsd2ss(XMMRegister dst, XMMRegister src);
void cvtsd2ss(XMMRegister dst, Address src);
void cvtsi2sdl(XMMRegister dst, Register src);
void cvtsi2sdl(XMMRegister dst, Address src);
void cvtsi2ssl(XMMRegister dst, Register src);
void cvtsi2ssl(XMMRegister dst, Address src);
#ifdef _LP64
void cvtsi2sdq(XMMRegister dst, Register src);
void cvtsi2sdq(XMMRegister dst, Address src);
void cvtsi2ssq(XMMRegister dst, Register src);
void cvtsi2ssq(XMMRegister dst, Address src);
#endif

void locked_cmpxchgptr(Register reg, AddressLiteral adr, Register rscratch = noreg);

void imulptr(Register dst, Register src) { LP64_ONLY(imulq(dst, src)) NOT_LP64(imull(dst, src)); }
Expand Down
4 changes: 2 additions & 2 deletions src/hotspot/cpu/x86/x86_64.ad
Original file line number Diff line number Diff line change
Expand Up @@ -10940,7 +10940,7 @@ instruct cmpD_imm(rRegI dst, regD src, immD con, rFlagsReg cr) %{
instruct convF2D_reg_reg(regD dst, regF src)
%{
match(Set dst (ConvF2D src));

effect(TEMP dst);
format %{ "cvtss2sd $dst, $src" %}
ins_encode %{
__ cvtss2sd ($dst$$XMMRegister, $src$$XMMRegister);
Expand All @@ -10962,7 +10962,7 @@ instruct convF2D_reg_mem(regD dst, memory src)
instruct convD2F_reg_reg(regF dst, regD src)
%{
match(Set dst (ConvD2F src));

effect(TEMP dst);
format %{ "cvtsd2ss $dst, $src" %}
ins_encode %{
__ cvtsd2ss ($dst$$XMMRegister, $src$$XMMRegister);
Expand Down
142 changes: 142 additions & 0 deletions test/micro/org/openjdk/bench/vm/compiler/x86/ComputePI.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
/*
* Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved.
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
*
* This code is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License version 2 only, as
* published by the Free Software Foundation.
*
* This code is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
* version 2 for more details (a copy is included in the LICENSE file that
* accompanied this code).
*
* You should have received a copy of the GNU General Public License version
* 2 along with this work; if not, write to the Free Software Foundation,
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
*
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
* or visit www.oracle.com if you need additional information or have any
* questions.
*/

package org.openjdk.bench.vm.compiler;

import org.openjdk.jmh.annotations.Benchmark;
import org.openjdk.jmh.annotations.BenchmarkMode;
import org.openjdk.jmh.annotations.Fork;
import org.openjdk.jmh.annotations.Measurement;
import org.openjdk.jmh.annotations.Mode;
import org.openjdk.jmh.annotations.OutputTimeUnit;
import org.openjdk.jmh.annotations.Scope;
import org.openjdk.jmh.annotations.Setup;
import org.openjdk.jmh.annotations.State;
import org.openjdk.jmh.annotations.Warmup;

import java.util.concurrent.TimeUnit;

@State(Scope.Thread)
@BenchmarkMode(Mode.AverageTime)
@OutputTimeUnit(TimeUnit.NANOSECONDS)
@Warmup(iterations = 5, time = 5, timeUnit = TimeUnit.SECONDS)
@Measurement(iterations = 5, time = 5, timeUnit = TimeUnit.SECONDS)
@Fork(value = 3)
public class ComputePI {

@Benchmark
public double compute_pi_int_dbl() {
double pi = 4.0;
boolean sign = false;

for (int i = 3; i < 1000; i += 2) {
if (sign) {
pi += 4.0 / i;
} else {
pi -= 4.0 / i;
}
sign = !sign;
}
return pi;
}

@Benchmark
public double compute_pi_int_flt() {
float pi = 4.0f;
boolean sign = false;

for (int i = 3; i < 1000; i += 2) {
if (sign) {
pi += 4.0f / i;
} else {
pi -= 4.0f / i;
}
sign = !sign;
}
return pi;
}

@Benchmark
public double compute_pi_long_dbl() {
double pi = 4.0;
boolean sign = false;

for (long i = 3; i < 1000; i += 2) {
if (sign) {
pi += 4.0 / i;
} else {
pi -= 4.0 / i;
}
sign = !sign;
}
return pi;
}

@Benchmark
public double compute_pi_long_flt() {
float pi = 4.0f;
boolean sign = false;

for (long i = 3; i < 1000; i += 2) {
if (sign) {
pi += 4.0f / i;
} else {
pi -= 4.0f / i;
}
sign = !sign;
}
return pi;
}

@Benchmark
public double compute_pi_flt_dbl() {
double pi = 4.0;
boolean sign = false;

for (float i = 3.0f; i < 1000.0f; i += 2.0f) {
if (sign) {
pi += 4.0 / i;
} else {
pi -= 4.0 / i;
}
sign = !sign;
}
return pi;
}

@Benchmark
public double compute_pi_dbl_flt() {
float pi = 4.0f;
boolean sign = false;

for (float i = 3.0f; i < 1000.0f; i += 2.0f) {
if (sign) {
pi += 4.0f / i;
} else {
pi -= 4.0f / i;
}
sign = !sign;
}
return pi;
}
}

1 comment on commit 0941f2c

@openjdk-notifier
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.