Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8266054: VectorAPI rotate operation optimization #3720

Closed
wants to merge 19 commits into from
Closed
Changes from all commits
Commits
File filter
Filter file types
Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.

Always

Just for now

@@ -138,6 +138,11 @@
return false;
}

// Does the CPU support vector constant rotate instructions?
// This platform has no immediate-form vector rotate, so the answer is
// unconditionally false regardless of the shift amount.
static constexpr bool supports_vector_constant_rotates(int shift) {
  return false;
}

// Does the CPU supports vector unsigned comparison instructions?
static const bool supports_vector_comparison_unsigned(int vlen, BasicType bt) {
// Not supported on SVE yet.
@@ -131,6 +131,11 @@
return false; // not supported
}

// Does the CPU support vector constant rotate instructions?
// This platform has no immediate-form vector rotate, so the answer is
// unconditionally false regardless of the shift amount.
static constexpr bool supports_vector_constant_rotates(int shift) {
  return false;
}

// Does the CPU supports vector unsigned comparison instructions?
static constexpr bool supports_vector_comparison_unsigned(int vlen, BasicType bt) {
return false;
@@ -138,6 +138,11 @@
return false;
}

// Does the CPU support vector constant rotate instructions?
// This platform has no immediate-form vector rotate, so the answer is
// unconditionally false regardless of the shift amount.
static constexpr bool supports_vector_constant_rotates(int shift) {
  return false;
}

// Does the CPU supports vector unsigned comparison instructions?
static constexpr bool supports_vector_comparison_unsigned(int vlen, BasicType bt) {
return false;
@@ -128,6 +128,11 @@
return false;
}

// Does the CPU support vector constant rotate instructions?
// This platform has no immediate-form vector rotate, so the answer is
// unconditionally false regardless of the shift amount.
static constexpr bool supports_vector_constant_rotates(int shift) {
  return false;
}

// Does the CPU supports vector unsigned comparison instructions?
static constexpr bool supports_vector_comparison_unsigned(int vlen, BasicType bt) {
return false;
@@ -158,6 +158,11 @@
return true;
}

// Does the CPU support vector constant rotate instructions?
// The rotate count is encoded as a signed 8-bit immediate in the
// instruction, so only shifts representable in [-0x80, 0x7F] qualify;
// other counts must go through the variable-count (vector) path.
static constexpr bool supports_vector_constant_rotates(int shift) {
  return -0x80 <= shift && shift < 0x80;
}

// Does the CPU supports vector unsigned comparison instructions?
static const bool supports_vector_comparison_unsigned(int vlen, BasicType bt) {
int vlen_in_bytes = vlen * type2aelembytes(bt);
@@ -1671,6 +1671,9 @@ const bool Matcher::match_rule_supported_vector(int opcode, int vlen, BasicType
break;
case Op_RotateRightV:
case Op_RotateLeftV:
if (bt != T_INT && bt != T_LONG) {
return false;
} // fallthrough
case Op_MacroLogicV:
if (!VM_Version::supports_evex() ||
((size_in_bits != 512) && !VM_Version::supports_avx512vl())) {
@@ -336,6 +336,7 @@ class LibraryCallKit : public GraphKit {
};

bool arch_supports_vector(int op, int num_elem, BasicType type, VectorMaskUseType mask_use_type, bool has_scalar_args = false);
bool arch_supports_vector_rotate(int opc, int num_elem, BasicType elem_bt, bool has_scalar_args = false);

void clear_upper_avx() {
#ifdef X86
@@ -2488,9 +2488,8 @@ void SuperWord::output() {
} else if (VectorNode::is_scalar_rotate(n)) {
Node* in1 = low_adr->in(1);
Node* in2 = p->at(0)->in(2);
assert(in2->bottom_type()->isa_int(), "Shift must always be an int value");
// If rotation count is non-constant or greater than 8bit value create a vector.
if (!in2->is_Con() || -0x80 > in2->get_int() || in2->get_int() >= 0x80) {
if (!in2->is_Con() || !Matcher::supports_vector_constant_rotates(in2->get_int())) {
in2 = vector_opd(p, 2);
}
vn = VectorNode::make(opc, in1, in2, vlen, velt_basic_type(n));
@@ -59,6 +59,48 @@ static bool check_vbox(const TypeInstPtr* vbox_type) {
}
#endif

// Check whether the target can implement a vector rotate of the given
// shape, either directly (a matching vector rotate rule) or indirectly by
// degenerating the rotate into (v << n) | (v >>> (lanebits - n)), which
// needs vector left shift, vector unsigned right shift and vector OR.
//
// opc             - Op_RotateLeftV or Op_RotateRightV
// num_elem        - number of vector lanes
// elem_bt         - lane basic type (integral types only)
// has_scalar_args - true only for a non-constant scalar shift count,
//                   which additionally requires a supported broadcast
//                   (replicate) to turn the scalar into a vector
bool LibraryCallKit::arch_supports_vector_rotate(int opc, int num_elem, BasicType elem_bt, bool has_scalar_args) {
  bool is_supported = true;
  // has_scalar_args flag is true only for non-constant scalar shift count,
  // since in this case shift needs to be broadcasted.
  if (!Matcher::match_rule_supported_vector(opc, num_elem, elem_bt) ||
      (has_scalar_args &&
       !arch_supports_vector(VectorNode::replicate_opcode(elem_bt), num_elem, elem_bt, VecMaskNotUsed))) {
    is_supported = false;
  }

  // Select the scalar shift opcodes whose vector forms are needed by the
  // degenerated (shift | shift) rotate sequence for this lane type.
  int lshiftopc, rshiftopc;
  switch (elem_bt) {
    case T_BYTE:
      lshiftopc = Op_LShiftI;
      rshiftopc = Op_URShiftB;
      break;
    case T_SHORT:
      lshiftopc = Op_LShiftI;
      rshiftopc = Op_URShiftS;
      break;
    case T_INT:
      lshiftopc = Op_LShiftI;
      rshiftopc = Op_URShiftI;
      break;
    case T_LONG:
      lshiftopc = Op_LShiftL;
      rshiftopc = Op_URShiftL;
      break;
    default:
      assert(false, "Unexpected type");
      // Unreachable for valid integral lane types. Fail safely in product
      // builds (where the assert compiles out) instead of reading the
      // uninitialized shift opcodes below.
      return false;
  }
  int lshiftvopc = VectorNode::opcode(lshiftopc, elem_bt);
  int rshiftvopc = VectorNode::opcode(rshiftopc, elem_bt);
  // No direct rotate support: accept if the degenerated form is available.
  if (!is_supported &&
      arch_supports_vector(lshiftvopc, num_elem, elem_bt, VecMaskNotUsed) &&
      arch_supports_vector(rshiftvopc, num_elem, elem_bt, VecMaskNotUsed) &&
      arch_supports_vector(Op_OrV, num_elem, elem_bt, VecMaskNotUsed)) {
    is_supported = true;
  }
  return is_supported;
}

Node* GraphKit::box_vector(Node* vector, const TypeInstPtr* vbox_type, BasicType elem_bt, int num_elem, bool deoptimize_on_exception) {
assert(EnableVectorSupport, "");

@@ -112,17 +154,29 @@ bool LibraryCallKit::arch_supports_vector(int sopc, int num_elem, BasicType type
return false;
}

// Check that architecture supports this op-size-type combination.
if (!Matcher::match_rule_supported_vector(sopc, num_elem, type)) {
if (VectorNode::is_vector_rotate(sopc)) {
if(!arch_supports_vector_rotate(sopc, num_elem, type, has_scalar_args)) {
#ifndef PRODUCT
if (C->print_intrinsics()) {
tty->print_cr(" ** Rejected vector op (%s,%s,%d) because architecture does not support it",
NodeClassNames[sopc], type2name(type), num_elem);
}
if (C->print_intrinsics()) {
tty->print_cr(" ** Rejected vector op (%s,%s,%d) because architecture does not support variable vector shifts",
NodeClassNames[sopc], type2name(type), num_elem);
}
#endif
return false;
return false;
}
} else {
assert(Matcher::match_rule_supported(sopc), "must be supported");
// Check that architecture supports this op-size-type combination.
if (!Matcher::match_rule_supported_vector(sopc, num_elem, type)) {
#ifndef PRODUCT
if (C->print_intrinsics()) {
tty->print_cr(" ** Rejected vector op (%s,%s,%d) because architecture does not support it",
NodeClassNames[sopc], type2name(type), num_elem);
}
#endif
return false;
} else {
assert(Matcher::match_rule_supported(sopc), "must be supported");
}
}

if (num_elem == 1) {
@@ -1500,7 +1554,9 @@ bool LibraryCallKit::inline_vector_broadcast_int() {
BasicType elem_bt = elem_type->basic_type();
int num_elem = vlen->get_con();
int opc = VectorSupport::vop2ideal(opr->get_con(), elem_bt);
if (opc == 0 || !VectorNode::is_shift_opcode(opc)) {
bool is_shift = VectorNode::is_shift_opcode(opc);
bool is_rotate = VectorNode::is_rotate_opcode(opc);
if (opc == 0 || (!is_shift && !is_rotate)) {
if (C->print_intrinsics()) {
tty->print_cr(" ** operation not supported: op=%d bt=%s", opr->get_con(), type2name(elem_bt));
}
@@ -1513,18 +1569,37 @@ bool LibraryCallKit::inline_vector_broadcast_int() {
}
return false; // operation not supported
}
Node* cnt = argument(5);
ciKlass* vbox_klass = vector_klass->const_oop()->as_instance()->java_lang_Class_klass();
const TypeInstPtr* vbox_type = TypeInstPtr::make_exact(TypePtr::NotNull, vbox_klass);
const TypeInt* cnt_type = cnt->bottom_type()->isa_int();

if (!arch_supports_vector(sopc, num_elem, elem_bt, VecMaskNotUsed, true /*has_scalar_args*/)) {
// If CPU supports vector constant rotate instructions pass it directly
bool is_const_rotate = is_rotate && cnt_type && cnt_type->is_con() &&
Matcher::supports_vector_constant_rotates(cnt_type->get_con());
bool has_scalar_args = is_rotate ? !is_const_rotate : true;
if (!arch_supports_vector(sopc, num_elem, elem_bt, VecMaskNotUsed, has_scalar_args)) {
if (C->print_intrinsics()) {
tty->print_cr(" ** not supported: arity=0 op=int/%d vlen=%d etype=%s ismask=no",
sopc, num_elem, type2name(elem_bt));
}
return false; // not supported
}
Node* opd1 = unbox_vector(argument(4), vbox_type, elem_bt, num_elem);
Node* opd2 = vector_shift_count(argument(5), opc, elem_bt, num_elem);
Node* opd2 = NULL;
if (is_shift) {
opd2 = vector_shift_count(cnt, opc, elem_bt, num_elem);
} else {
assert(is_rotate, "unexpected operation");
if (!is_const_rotate) {
const Type * type_bt = Type::get_const_basic_type(elem_bt);
cnt = elem_bt == T_LONG ? gvn().transform(new ConvI2LNode(cnt)) : cnt;
opd2 = gvn().transform(VectorNode::scalar2vector(cnt, num_elem, type_bt));
} else {
Comment on lines +1594 to +1598

This comment has been minimized.

@sviswa7

sviswa7 Jul 27, 2021

Why conversion for only T_LONG and not for T_BYTE and T_SHORT? Is there an assumption here that only T_INT and T_LONG elem_bt are supported?

This comment has been minimized.

@jatin-bhateja

jatin-bhateja Jul 27, 2021
Author Member

Correcting this, I2L may be needed in auto-vectorization flow since Integer/Long.rotate[Right/Left] APIs accept only integral shift, so for Long.rotate* operations integral shift value must be converted to long using I2L before broadcasting it. VectorAPI lanewise operations between vector-scalar, scalar type already matches with vector basic type. Since degeneration routine is common b/w both the flows so maintaining IR consistency here.

This comment has been minimized.

@sviswa7

sviswa7 Jul 27, 2021

For Vector API the shift is always coming in as int type for rotate by scalar (lanewiseShiftTemplate). The down conversion to byte or short needs to be done before scalar2vector.

This comment has been minimized.

@sviswa7

sviswa7 Jul 27, 2021

I see that similar thing is done before for shift, so down conversion to sub type is not required.

// Constant shift value.
opd2 = cnt;
}
}
if (opd1 == NULL || opd2 == NULL) {
return false;
}
@@ -142,9 +142,9 @@ int VectorNode::opcode(int sopc, BasicType bt) {
case Op_RoundDoubleMode:
return (bt == T_DOUBLE ? Op_RoundDoubleModeV : 0);
case Op_RotateLeft:
return (bt == T_LONG || bt == T_INT ? Op_RotateLeftV : 0);
return (is_integral_type(bt) ? Op_RotateLeftV : 0);
case Op_RotateRight:
return (bt == T_LONG || bt == T_INT ? Op_RotateRightV : 0);
return (is_integral_type(bt) ? Op_RotateRightV : 0);
case Op_SqrtF:
return (bt == T_FLOAT ? Op_SqrtVF : 0);
case Op_SqrtD:
@@ -261,7 +261,7 @@ bool VectorNode::implemented(int opc, uint vlen, BasicType bt) {
// For rotate operation we will do a lazy de-generation into
// OrV/LShiftV/URShiftV pattern if the target does not support
// vector rotation instruction.
if (vopc == Op_RotateLeftV || vopc == Op_RotateRightV) {
if (VectorNode::is_vector_rotate(vopc)) {
return is_vector_rotate_supported(vopc, vlen, bt);
}
return vopc > 0 && Matcher::match_rule_supported_vector(vopc, vlen, bt);
@@ -295,15 +295,8 @@ bool VectorNode::is_roundopD(Node* n) {
return false;
}

bool VectorNode::is_scalar_rotate(Node* n) {
if (n->Opcode() == Op_RotateLeft || n->Opcode() == Op_RotateRight) {
return true;
}
return false;
}

bool VectorNode::is_vector_rotate_supported(int vopc, uint vlen, BasicType bt) {
assert(vopc == Op_RotateLeftV || vopc == Op_RotateRightV, "wrong opcode");
assert(VectorNode::is_vector_rotate(vopc), "wrong opcode");

// If target defines vector rotation patterns then no
// need for degeneration.
@@ -347,6 +340,23 @@ bool VectorNode::is_shift(Node* n) {
return is_shift_opcode(n->Opcode());
}

// Returns true iff opc is a scalar rotate opcode (RotateLeft/RotateRight).
bool VectorNode::is_rotate_opcode(int opc) {
  return opc == Op_RotateRight || opc == Op_RotateLeft;
}

// Returns true iff node n is a scalar rotate (RotateLeft or RotateRight).
bool VectorNode::is_scalar_rotate(Node* n) {
  // Delegate to the opcode predicate instead of the redundant
  // `if (cond) return true; return false;` pattern.
  return is_rotate_opcode(n->Opcode());
}

bool VectorNode::is_vshift_cnt(Node* n) {
switch (n->Opcode()) {
case Op_LShiftCntV:
@@ -578,6 +588,16 @@ VectorNode* VectorNode::shift_count(int opc, Node* cnt, uint vlen, BasicType bt)
}
}

// Returns true iff opc is a vector rotate opcode (RotateLeftV/RotateRightV).
bool VectorNode::is_vector_rotate(int opc) {
  return opc == Op_RotateLeftV || opc == Op_RotateRightV;
}

bool VectorNode::is_vector_shift(int opc) {
assert(opc > _last_machine_leaf && opc < _last_opcode, "invalid opcode");
switch (opc) {
@@ -1131,42 +1151,66 @@ MacroLogicVNode* MacroLogicVNode::make(PhaseGVN& gvn, Node* in1, Node* in2, Node

Node* VectorNode::degenerate_vector_rotate(Node* src, Node* cnt, bool is_rotate_left,
int vlen, BasicType bt, PhaseGVN* phase) {
assert(bt == T_INT || bt == T_LONG, "sanity");
assert(is_integral_type(bt), "sanity");
const TypeVect* vt = TypeVect::make(bt, vlen);

int shift_mask = (bt == T_INT) ? 0x1F : 0x3F;
int shiftLOpc = (bt == T_INT) ? Op_LShiftI : Op_LShiftL;
int shiftROpc = (bt == T_INT) ? Op_URShiftI: Op_URShiftL;
int shift_mask = (type2aelembytes(bt) * 8) - 1;
int shiftLOpc = (bt == T_LONG) ? Op_LShiftL : Op_LShiftI;
auto urshiftopc = [=]() {
switch(bt) {
case T_INT: return Op_URShiftI;
case T_LONG: return Op_URShiftL;
case T_BYTE: return Op_URShiftB;
case T_SHORT: return Op_URShiftS;
default: return (Opcodes)0;
}
};
int shiftROpc = urshiftopc();

// Compute shift values for right rotation and
// later swap them in case of left rotation.
Node* shiftRCnt = NULL;
Node* shiftLCnt = NULL;
if (cnt->is_Con() && cnt->bottom_type()->isa_int()) {
// Constant shift case.
int shift = cnt->get_int() & shift_mask;
const TypeInt* cnt_type = cnt->bottom_type()->isa_int();
bool is_binary_vector_op = false;
if (cnt_type && cnt_type->is_con()) {
// Constant shift.
int shift = cnt_type->get_con() & shift_mask;
shiftRCnt = phase->intcon(shift);
shiftLCnt = phase->intcon(shift_mask + 1 - shift);
} else {
// Variable shift case.
} else if (VectorNode::is_invariant_vector(cnt)) {
// Scalar variable shift, handle replicates generated by auto vectorizer.
assert(VectorNode::is_invariant_vector(cnt), "Broadcast expected");
cnt = cnt->in(1);
if (bt == T_LONG) {
// Shift count vector for Rotate vector has long elements too.
assert(cnt->Opcode() == Op_ConvI2L, "ConvI2L expected");
cnt = cnt->in(1);
}
shiftRCnt = phase->transform(new AndINode(cnt, phase->intcon(shift_mask)));
shiftRCnt = cnt;

This comment has been minimized.

@sviswa7

sviswa7 Jul 16, 2021

Why do we remove the And with mask here?

This comment has been minimized.

@jatin-bhateja

jatin-bhateja Jul 18, 2021
Author Member

And'ing with shift_mask is already done on the Java API side implementation before the call into the intrinsic routine.

This comment has been minimized.

@sviswa7

sviswa7 Jul 21, 2021

This path is also reached from the non-Vector-API (auto-vectorization) flow; won't masking be needed there?

This comment has been minimized.

@sviswa7

sviswa7 Jul 26, 2021

@jatin-bhateja This question is still pending.

This comment has been minimized.

@jatin-bhateja

jatin-bhateja Jul 26, 2021
Author Member

@sviswa7, SLP flow will either have a constant 8bit shift value or a variable shift present in vector, this also include broadcasted non-constant shift value.

This comment has been minimized.

@theRealELiu

theRealELiu Jul 27, 2021
Contributor

It would be better to add a comment here, since the correctness relies on guarantees established elsewhere.

This comment has been minimized.

@jatin-bhateja

jatin-bhateja Jul 27, 2021
Author Member

@theRealELiu , @sviswa7 , comment already exist in code, I guess I mentioned incorrectly earlier on this thread, rectified my comments.

shiftLCnt = phase->transform(new SubINode(phase->intcon(shift_mask + 1), shiftRCnt));
} else {
// Vector variable shift.
assert(cnt->bottom_type()->isa_vect(), "Unexpected shift");
const Type* elem_ty = Type::get_const_basic_type(bt);
Node* shift_mask_node = (bt == T_LONG) ? (Node*)(phase->longcon(shift_mask + 1L)) :
(Node*)(phase->intcon(shift_mask + 1));
Node* vector_mask = phase->transform(VectorNode::scalar2vector(shift_mask_node,vlen, elem_ty));
int subVopc = VectorNode::opcode((bt == T_LONG) ? Op_SubL : Op_SubI, bt);
Comment on lines +1196 to +1199

This comment has been minimized.

@sviswa7

sviswa7 Jul 27, 2021

There seems to be an assumption here that the vector type is INT or LONG only and not subword type. From Vector API you can get the sub word types as well.
Also if this path is coming from auto-vectorizer, don't we need masking here?

This comment has been minimized.

@jatin-bhateja

jatin-bhateja Jul 27, 2021
Author Member

Subtype is being passed to VectorNode::opcode for correct opcode selection. Also shift_mask_node is a constant value node, so there is no assumption on vector type. Wrap around (masking) for shift value may not be needed here since we are degenerating rotate into shifts (logical left and rights). Also scalar rotate nodes getting into SLP flow wraps the constant shift values appropriately during RotateLeft/RightNode Idealizations.

shiftRCnt = cnt;
shiftLCnt = phase->transform(VectorNode::make(subVopc, vector_mask, shiftRCnt, vt));
is_binary_vector_op = true;
}

// Swap the computed left and right shift counts.
if (is_rotate_left) {
swap(shiftRCnt,shiftLCnt);
}

shiftLCnt = phase->transform(new LShiftCntVNode(shiftLCnt, vt));
shiftRCnt = phase->transform(new RShiftCntVNode(shiftRCnt, vt));
if (!is_binary_vector_op) {
shiftLCnt = phase->transform(new LShiftCntVNode(shiftLCnt, vt));
shiftRCnt = phase->transform(new RShiftCntVNode(shiftRCnt, vt));
}

return new OrVNode(phase->transform(VectorNode::make(shiftLOpc, src, shiftLCnt, vlen, bt)),
phase->transform(VectorNode::make(shiftROpc, src, shiftRCnt, vlen, bt)),
ProTip! Use n and p to navigate between commits in a pull request.