Skip to content

Commit d23a8bf

Browse files
changpeng1997 authored and adinn committed
8297753: AArch64: Add optimized rules for vector compare with zero on NEON
Reviewed-by: aph
1 parent 339ca88 commit d23a8bf

File tree

11 files changed

+1048
-506
lines changed

11 files changed

+1048
-506
lines changed

src/hotspot/cpu/aarch64/aarch64.ad

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1191,6 +1191,10 @@ public:
11911191

11921192
// predicate controlling addressing modes
11931193
bool size_fits_all_mem_uses(AddPNode* addp, int shift);
1194+
1195+
// Convert BoolTest condition to Assembler condition.
1196+
// Replicate the logic of cmpOpOper::ccode() and cmpOpUOper::ccode().
1197+
Assembler::Condition to_assembler_cond(BoolTest::mask cond);
11941198
%}
11951199

11961200
source %{
@@ -2527,6 +2531,50 @@ bool size_fits_all_mem_uses(AddPNode* addp, int shift) {
25272531
return true;
25282532
}
25292533

2534+
// Convert BoolTest condition to Assembler condition.
2535+
// Replicate the logic of cmpOpOper::ccode() and cmpOpUOper::ccode().
2536+
Assembler::Condition to_assembler_cond(BoolTest::mask cond) {
2537+
Assembler::Condition result;
2538+
switch(cond) {
2539+
case BoolTest::eq:
2540+
result = Assembler::EQ; break;
2541+
case BoolTest::ne:
2542+
result = Assembler::NE; break;
2543+
case BoolTest::le:
2544+
result = Assembler::LE; break;
2545+
case BoolTest::ge:
2546+
result = Assembler::GE; break;
2547+
case BoolTest::lt:
2548+
result = Assembler::LT; break;
2549+
case BoolTest::gt:
2550+
result = Assembler::GT; break;
2551+
case BoolTest::ule:
2552+
result = Assembler::LS; break;
2553+
case BoolTest::uge:
2554+
result = Assembler::HS; break;
2555+
case BoolTest::ult:
2556+
result = Assembler::LO; break;
2557+
case BoolTest::ugt:
2558+
result = Assembler::HI; break;
2559+
case BoolTest::overflow:
2560+
result = Assembler::VS; break;
2561+
case BoolTest::no_overflow:
2562+
result = Assembler::VC; break;
2563+
default:
2564+
ShouldNotReachHere();
2565+
return Assembler::Condition(-1);
2566+
}
2567+
2568+
// Check conversion
2569+
if (cond & BoolTest::unsigned_compare) {
2570+
assert(cmpOpUOper((BoolTest::mask)((int)cond & ~(BoolTest::unsigned_compare))).ccode() == result, "Invalid conversion");
2571+
} else {
2572+
assert(cmpOpOper(cond).ccode() == result, "Invalid conversion");
2573+
}
2574+
2575+
return result;
2576+
}
2577+
25302578
// Binary src (Replicate con)
25312579
bool is_valid_sve_arith_imm_pattern(Node* n, Node* m) {
25322580
if (n == NULL || m == NULL) {
@@ -4263,6 +4311,17 @@ operand immI_positive()
42634311
interface(CONST_INTER);
42644312
%}
42654313

4314+
// BoolTest condition for signed compare
4315+
operand immI_cmp_cond()
4316+
%{
4317+
predicate(n->get_int() < (int)(BoolTest::unsigned_compare));
4318+
match(ConI);
4319+
4320+
op_cost(0);
4321+
format %{ %}
4322+
interface(CONST_INTER);
4323+
%}
4324+
42664325
operand immL_255()
42674326
%{
42684327
predicate(n->get_long() == 255L);

src/hotspot/cpu/aarch64/aarch64_vector.ad

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5137,6 +5137,61 @@ instruct vmaskcmp_neon(vReg dst, vReg src1, vReg src2, immI cond) %{
51375137
ins_pipe(pipe_slow);
51385138
%}
51395139

5140+
instruct vmaskcmp_zeroI_neon(vReg dst, vReg src, immI0 zero, immI_cmp_cond cond) %{
5141+
predicate(UseSVE == 0);
5142+
match(Set dst (VectorMaskCmp (Binary src (ReplicateB zero)) cond));
5143+
match(Set dst (VectorMaskCmp (Binary src (ReplicateS zero)) cond));
5144+
match(Set dst (VectorMaskCmp (Binary src (ReplicateI zero)) cond));
5145+
format %{ "vmaskcmp_zeroI_neon $dst, $src, #0, $cond" %}
5146+
ins_encode %{
5147+
Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
5148+
BasicType bt = Matcher::vector_element_basic_type(this);
5149+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
5150+
__ neon_compare_zero($dst$$FloatRegister, bt, $src$$FloatRegister,
5151+
condition, /* isQ */ length_in_bytes == 16);
5152+
%}
5153+
ins_pipe(pipe_slow);
5154+
%}
5155+
5156+
instruct vmaskcmp_zeroL_neon(vReg dst, vReg src, immL0 zero, immI_cmp_cond cond) %{
5157+
predicate(UseSVE == 0);
5158+
match(Set dst (VectorMaskCmp (Binary src (ReplicateL zero)) cond));
5159+
format %{ "vmaskcmp_zeroL_neon $dst, $src, #0, $cond" %}
5160+
ins_encode %{
5161+
Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
5162+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
5163+
__ neon_compare_zero($dst$$FloatRegister, T_LONG, $src$$FloatRegister,
5164+
condition, /* isQ */ length_in_bytes == 16);
5165+
%}
5166+
ins_pipe(pipe_slow);
5167+
%}
5168+
5169+
instruct vmaskcmp_zeroF_neon(vReg dst, vReg src, immF0 zero, immI_cmp_cond cond) %{
5170+
predicate(UseSVE == 0);
5171+
match(Set dst (VectorMaskCmp (Binary src (ReplicateF zero)) cond));
5172+
format %{ "vmaskcmp_zeroF_neon $dst, $src, #0, $cond" %}
5173+
ins_encode %{
5174+
Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
5175+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
5176+
__ neon_compare_zero($dst$$FloatRegister, T_FLOAT, $src$$FloatRegister,
5177+
condition, /* isQ */ length_in_bytes == 16);
5178+
%}
5179+
ins_pipe(pipe_slow);
5180+
%}
5181+
5182+
instruct vmaskcmp_zeroD_neon(vReg dst, vReg src, immD0 zero, immI_cmp_cond cond) %{
5183+
predicate(UseSVE == 0);
5184+
match(Set dst (VectorMaskCmp (Binary src (ReplicateD zero)) cond));
5185+
format %{ "vmaskcmp_zeroD_neon $dst, $src, #0, $cond" %}
5186+
ins_encode %{
5187+
Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
5188+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
5189+
__ neon_compare_zero($dst$$FloatRegister, T_DOUBLE, $src$$FloatRegister,
5190+
condition, /* isQ */ length_in_bytes == 16);
5191+
%}
5192+
ins_pipe(pipe_slow);
5193+
%}
5194+
51405195
instruct vmaskcmp_sve(pReg dst, vReg src1, vReg src2, immI cond, rFlagsReg cr) %{
51415196
predicate(UseSVE > 0);
51425197
match(Set dst (VectorMaskCmp (Binary src1 src2) cond));

src/hotspot/cpu/aarch64/aarch64_vector_ad.m4

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3553,6 +3553,42 @@ instruct vmaskcmp_neon(vReg dst, vReg src1, vReg src2, immI cond) %{
35533553
ins_pipe(pipe_slow);
35543554
%}
35553555

3556+
instruct vmaskcmp_zeroI_neon(vReg dst, vReg src, immI0 zero, immI_cmp_cond cond) %{
3557+
predicate(UseSVE == 0);
3558+
match(Set dst (VectorMaskCmp (Binary src (ReplicateB zero)) cond));
3559+
match(Set dst (VectorMaskCmp (Binary src (ReplicateS zero)) cond));
3560+
match(Set dst (VectorMaskCmp (Binary src (ReplicateI zero)) cond));
3561+
format %{ "vmaskcmp_zeroI_neon $dst, $src, #0, $cond" %}
3562+
ins_encode %{
3563+
Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
3564+
BasicType bt = Matcher::vector_element_basic_type(this);
3565+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
3566+
__ neon_compare_zero($dst$$FloatRegister, bt, $src$$FloatRegister,
3567+
condition, /* isQ */ length_in_bytes == 16);
3568+
%}
3569+
ins_pipe(pipe_slow);
3570+
%}
3571+
dnl
3572+
dnl VMASKCMP_ZERO_NEON($1, $2 )
3573+
dnl VMASKCMP_ZERO_NEON(type, basic_type)
3574+
define(`VMASKCMP_ZERO_NEON', `
3575+
instruct vmaskcmp_zero$1_neon(vReg dst, vReg src, imm`$1'0 zero, immI_cmp_cond cond) %{
3576+
predicate(UseSVE == 0);
3577+
match(Set dst (VectorMaskCmp (Binary src (Replicate$1 zero)) cond));
3578+
format %{ "vmaskcmp_zero$1_neon $dst, $src, #0, $cond" %}
3579+
ins_encode %{
3580+
Assembler::Condition condition = to_assembler_cond((BoolTest::mask)$cond$$constant);
3581+
uint length_in_bytes = Matcher::vector_length_in_bytes(this);
3582+
__ neon_compare_zero($dst$$FloatRegister, $2, $src$$FloatRegister,
3583+
condition, /* isQ */ length_in_bytes == 16);
3584+
%}
3585+
ins_pipe(pipe_slow);
3586+
%}')dnl
3587+
dnl
3588+
VMASKCMP_ZERO_NEON(L, T_LONG)
3589+
VMASKCMP_ZERO_NEON(F, T_FLOAT)
3590+
VMASKCMP_ZERO_NEON(D, T_DOUBLE)
3591+
35563592
instruct vmaskcmp_sve(pReg dst, vReg src1, vReg src2, immI cond, rFlagsReg cr) %{
35573593
predicate(UseSVE > 0);
35583594
match(Set dst (VectorMaskCmp (Binary src1 src2) cond));

src/hotspot/cpu/aarch64/assembler_aarch64.hpp

Lines changed: 42 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2653,12 +2653,6 @@ template<typename R, typename... Rx>
26532653
INSN(cnt, 0, 0b100000010110, 0); // accepted arrangements: T8B, T16B
26542654
INSN(uaddlp, 1, 0b100000001010, 2); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S
26552655
INSN(uaddlv, 1, 0b110000001110, 1); // accepted arrangements: T8B, T16B, T4H, T8H, T4S
2656-
// Zero compare.
2657-
INSN(cmeq, 0, 0b100000100110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
2658-
INSN(cmge, 1, 0b100000100010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
2659-
INSN(cmgt, 0, 0b100000100010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
2660-
INSN(cmle, 1, 0b100000100110, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
2661-
INSN(cmlt, 0, 0b100000101010, 3); // accepted arrangements: T8B, T16B, T4H, T8H, T2S, T4S, T2D
26622656

26632657
#undef INSN
26642658

@@ -3190,6 +3184,48 @@ template<typename R, typename... Rx>
31903184

31913185
#undef INSN
31923186

3187+
// AdvSIMD compare with zero (vector)
3188+
void cm(Condition cond, FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {
3189+
starti;
3190+
assert(T != T1Q && T != T1D, "invalid arrangement");
3191+
int cond_op;
3192+
switch (cond) {
3193+
case EQ: cond_op = 0b001; break;
3194+
case GE: cond_op = 0b100; break;
3195+
case GT: cond_op = 0b000; break;
3196+
case LE: cond_op = 0b101; break;
3197+
case LT: cond_op = 0b010; break;
3198+
default:
3199+
ShouldNotReachHere();
3200+
break;
3201+
}
3202+
3203+
f(0, 31), f((int)T & 1, 30), f((cond_op >> 2) & 1, 29);
3204+
f(0b01110, 28, 24), f((int)T >> 1, 23, 22), f(0b10000010, 21, 14);
3205+
f(cond_op & 0b11, 13, 12), f(0b10, 11, 10), rf(Vn, 5), rf(Vd, 0);
3206+
}
3207+
3208+
// AdvSIMD Floating-point compare with zero (vector)
3209+
void fcm(Condition cond, FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn) {
3210+
starti;
3211+
assert(T == T2S || T == T4S || T == T2D, "invalid arrangement");
3212+
int cond_op;
3213+
switch (cond) {
3214+
case EQ: cond_op = 0b010; break;
3215+
case GT: cond_op = 0b000; break;
3216+
case GE: cond_op = 0b001; break;
3217+
case LE: cond_op = 0b011; break;
3218+
case LT: cond_op = 0b100; break;
3219+
default:
3220+
ShouldNotReachHere();
3221+
break;
3222+
}
3223+
3224+
f(0, 31), f((int)T & 1, 30), f(cond_op & 1, 29), f(0b011101, 28, 23);
3225+
f(((int)(T >> 1) & 1), 22), f(0b10000011, 21, 14);
3226+
f((cond_op >> 1) & 0b11, 13, 12), f(0b10, 11, 10), rf(Vn, 5), rf(Vd, 0);
3227+
}
3228+
31933229
void ext(FloatRegister Vd, SIMD_Arrangement T, FloatRegister Vn, FloatRegister Vm, int index)
31943230
{
31953231
starti;

src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.cpp

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -925,7 +925,7 @@ void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegis
925925
case BoolTest::eq: fcmeq(dst, size, src1, src2); break;
926926
case BoolTest::ne: {
927927
fcmeq(dst, size, src1, src2);
928-
notr(dst, T16B, dst);
928+
notr(dst, isQ ? T16B : T8B, dst);
929929
break;
930930
}
931931
case BoolTest::ge: fcmge(dst, size, src1, src2); break;
@@ -941,7 +941,7 @@ void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegis
941941
case BoolTest::eq: cmeq(dst, size, src1, src2); break;
942942
case BoolTest::ne: {
943943
cmeq(dst, size, src1, src2);
944-
notr(dst, T16B, dst);
944+
notr(dst, isQ ? T16B : T8B, dst);
945945
break;
946946
}
947947
case BoolTest::ge: cmge(dst, size, src1, src2); break;
@@ -959,6 +959,26 @@ void C2_MacroAssembler::neon_compare(FloatRegister dst, BasicType bt, FloatRegis
959959
}
960960
}
961961

962+
void C2_MacroAssembler::neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
963+
Condition cond, bool isQ) {
964+
SIMD_Arrangement size = esize2arrangement((unsigned)type2aelembytes(bt), isQ);
965+
if (bt == T_FLOAT || bt == T_DOUBLE) {
966+
if (cond == Assembler::NE) {
967+
fcm(Assembler::EQ, dst, size, src);
968+
notr(dst, isQ ? T16B : T8B, dst);
969+
} else {
970+
fcm(cond, dst, size, src);
971+
}
972+
} else {
973+
if (cond == Assembler::NE) {
974+
cm(Assembler::EQ, dst, size, src);
975+
notr(dst, isQ ? T16B : T8B, dst);
976+
} else {
977+
cm(cond, dst, size, src);
978+
}
979+
}
980+
}
981+
962982
// Compress the least significant bit of each byte to the rightmost and clear
963983
// the higher garbage bits.
964984
void C2_MacroAssembler::bytemask_compress(Register dst) {

src/hotspot/cpu/aarch64/c2_MacroAssembler_aarch64.hpp

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020, 2022, Oracle and/or its affiliates. All rights reserved.
2+
* Copyright (c) 2020, 2023, Oracle and/or its affiliates. All rights reserved.
33
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
44
*
55
* This code is free software; you can redistribute it and/or modify it
@@ -79,6 +79,9 @@
7979
void neon_compare(FloatRegister dst, BasicType bt, FloatRegister src1,
8080
FloatRegister src2, int cond, bool isQ);
8181

82+
void neon_compare_zero(FloatRegister dst, BasicType bt, FloatRegister src,
83+
Condition cond, bool isQ);
84+
8285
void sve_compare(PRegister pd, BasicType bt, PRegister pg,
8386
FloatRegister zn, FloatRegister zm, int cond);
8487

src/hotspot/cpu/aarch64/macroAssembler_aarch64.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5598,7 +5598,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst,
55985598
// ASCII-check on lo-parts (no sign).
55995599
FloatRegister vlox = vtmp1; // Merge lower bytes.
56005600
ASCII(orr(vlox, T16B, vlo0, vlo1));
5601-
umov(chk, vhix, D, 1); ASCII(cmlt(vlox, T16B, vlox));
5601+
umov(chk, vhix, D, 1); ASCII(cm(LT, vlox, T16B, vlox));
56025602
fmovd(max, vhix); ASCII(umaxv(vlox, T16B, vlox));
56035603
orr(chk, chk, max); ASCII(umov(max, vlox, B, 0));
56045604
ASCII(orr(chk, chk, max));
@@ -5624,7 +5624,7 @@ void MacroAssembler::encode_iso_array(Register src, Register dst,
56245624
uzp2(vhi, T16B, vtmp3, vtmp3);
56255625
// ISO-check on hi-parts (all zero).
56265626
// ASCII-check on lo-parts (no sign).
5627-
ASCII(cmlt(vtmp2, T16B, vlo));
5627+
ASCII(cm(LT, vtmp2, T16B, vlo));
56285628
fmovd(chk, vhi); ASCII(umaxv(vtmp2, T16B, vtmp2));
56295629
ASCII(umov(max, vtmp2, B, 0));
56305630
ASCII(orr(chk, chk, max));

test/hotspot/gtest/aarch64/aarch64-asmtest.py

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1363,6 +1363,28 @@ class TwoRegNEONOp(CommonNEONInstruction):
13631363
class ThreeRegNEONOp(TwoRegNEONOp):
13641364
numRegs = 3
13651365

1366+
class NEONFloatCompareWithZero(TwoRegNEONOp):
1367+
def __init__(self, args):
1368+
self._name = 'fcm'
1369+
self.arrangement, self.condition = args
1370+
self.insname = self._name + (self.condition).lower()
1371+
1372+
def cstr(self):
1373+
return ("%s(%s, %s, %s, %s);"
1374+
% ("__ " + self._name,
1375+
"Assembler::" + self.condition,
1376+
self._firstSIMDreg,
1377+
"__ T" + self.arrangement,
1378+
self._firstSIMDreg.nextReg()))
1379+
1380+
def astr(self):
1381+
return ("%s\t%s.%s, %s.%s, #0.0"
1382+
% (self.insname,
1383+
self._firstSIMDreg,
1384+
self.arrangement,
1385+
self._firstSIMDreg.nextReg(),
1386+
self.arrangement))
1387+
13661388
class SpecialCases(Instruction):
13671389
def __init__(self, data):
13681390
self._name = data[0]
@@ -1596,6 +1618,16 @@ def generate(kind, names):
15961618
["fminp", "fminp", "2S"], ["fminp", "fminp", "2D"],
15971619
])
15981620

1621+
neonFloatCompareWithZeroConditions = ['GT', 'GE', 'EQ', 'LT', 'LE']
1622+
neonFloatArrangement = ['2S', '4S', '2D']
1623+
neonFloatCompareWithZeroArgs = []
1624+
for condition in neonFloatCompareWithZeroConditions:
1625+
for currentArrangement in neonFloatArrangement:
1626+
currentArgs = [currentArrangement, condition]
1627+
neonFloatCompareWithZeroArgs.append(currentArgs)
1628+
1629+
generate(NEONFloatCompareWithZero, neonFloatCompareWithZeroArgs)
1630+
15991631
generate(TwoRegNEONOp,
16001632
[["absr", "abs", "8B"], ["absr", "abs", "16B"],
16011633
["absr", "abs", "4H"], ["absr", "abs", "8H"],

0 commit comments

Comments
 (0)