Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

8264104: Eliminate unnecessary vector mask conversion during VectorUnbox for floating point VectorMask #3238

Closed
wants to merge 5 commits into from
@@ -3289,6 +3289,32 @@ instruct storemask2L(vecD dst, vecX src, immI_8 size)
ins_pipe(pipe_slow);
%}

// vector mask cast

// No-op cast of a vector mask held in a 64-bit (vecD) register.
// Matched only when the result vector is 8 bytes long; the node is emitted
// as empty code on the assumption that source and destination masks share
// the same bit pattern (VectorMaskCastNode's constructor asserts equal lane
// count and element size). NOTE(review): reviewers suggested also encoding
// the in-type/out-type restriction in the predicate — confirm.
instruct vmaskcastD(vecD dst)
%{
predicate(n->bottom_type()->is_vect()->length_in_bytes() == 8);
match(Set dst (VectorMaskCast dst));
ins_cost(0);
format %{ "vmaskcast $dst\t# empty" %}
ins_encode %{
// empty
%}
ins_pipe(pipe_class_empty);
%}

// No-op cast of a vector mask held in a 128-bit (vecX) register.
// Counterpart of vmaskcastD for 16-byte vectors: matching source and
// destination operands ("dst" on both sides of the Set) plus the empty
// encoding make this a pure type change with zero generated code.
instruct vmaskcastX(vecX dst)
%{
predicate(n->bottom_type()->is_vect()->length_in_bytes() == 16);
match(Set dst (VectorMaskCast dst));
ins_cost(0);
format %{ "vmaskcast $dst\t# empty" %}
ins_encode %{
// empty
%}
ins_pipe(pipe_class_empty);
%}

//-------------------------------- LOAD_IOTA_INDICES----------------------------------

instruct loadcon8B(vecD dst, immI0 src)
@@ -1232,6 +1232,25 @@ instruct storemask2L(vecD dst, vecX src, immI_8 size)
ins_pipe(pipe_slow);
%}

// vector mask cast
dnl
dnl VECTOR_MASK_CAST(SUFFIX, BYTES) emits a no-op "vmaskcast" instruct for
dnl the vecSUFFIX register class, matched when the result vector length is
dnl exactly BYTES. The encoding is intentionally empty: the cast only
dnl relabels the mask's type while the register contents stay unchanged.
dnl The quoted `' after $1 is an m4 idiom that separates the macro argument
dnl from the following "(" so the two concatenate without expansion issues.
define(`VECTOR_MASK_CAST', `
instruct vmaskcast$1`'(vec$1 dst)
%{
predicate(n->bottom_type()->is_vect()->length_in_bytes() == $2);
match(Set dst (VectorMaskCast dst));
ins_cost(0);
format %{ "vmaskcast $dst\t# empty" %}
ins_encode %{
// empty
%}
ins_pipe(pipe_class_empty);
%}')dnl
dnl            $1  $2
dnl instantiations: D = 64-bit (8-byte) and X = 128-bit (16-byte) vectors
VECTOR_MASK_CAST(D, 8)
VECTOR_MASK_CAST(X, 16)
dnl

//-------------------------------- LOAD_IOTA_INDICES----------------------------------
dnl
define(`PREDICATE', `ifelse($1, 8,
@@ -1700,3 +1700,17 @@ instruct vsubD(vReg dst, vReg src1, vReg src2) %{
%}
ins_pipe(pipe_slow);
%}

// vector mask cast

// No-op cast of a vector mask for SVE (UseSVE > 0).
// Source and destination are the same operand, and the encoding is empty:
// the cast is assumed to leave the register's bit pattern valid for the new
// mask type (VectorMaskCastNode's constructor asserts equal lane count and
// element byte size). NOTE(review): unlike the NEON variants there is no
// length_in_bytes() check here — confirm the SVE predicate is sufficient.
instruct vmaskcast(vReg dst) %{
predicate(UseSVE > 0);
match(Set dst (VectorMaskCast dst));
ins_cost(0);
format %{ "vmaskcast $dst\t# empty (sve)" %}
ins_encode %{
// empty
%}
ins_pipe(pipe_class_empty);
%}

@@ -853,3 +853,17 @@ BINARY_OP_UNPREDICATED(vsubI, SubVI, S, 4, sve_sub)
BINARY_OP_UNPREDICATED(vsubL, SubVL, D, 2, sve_sub)
BINARY_OP_UNPREDICATED(vsubF, SubVF, S, 4, sve_fsub)
BINARY_OP_UNPREDICATED(vsubD, SubVD, D, 2, sve_fsub)

// vector mask cast

instruct vmaskcast(vReg dst) %{
predicate(UseSVE > 0);
match(Set dst (VectorMaskCast dst));
This conversation was marked as resolved by XiaohongGong

This comment has been minimized.

@jatin-bhateja

jatin-bhateja Apr 13, 2021
Member

A strict check based on in-type and out-type in predicate could strengthen the pattern.

This comment has been minimized.

This comment has been minimized.

@XiaohongGong

XiaohongGong Apr 14, 2021
Author Contributor

I've added the type assertion in the constructor of the "VectorMaskCastNode". See:

assert(in_vt->length() == vt->length(), "vector length must match");
assert(type2aelembytes(in_vt->element_basic_type()) == type2aelembytes(vt->element_basic_type()), "element size must match");

That's why I didn't add the check in predicate. So do you think it's enough?

This comment has been minimized.

@iwanowww

iwanowww Apr 14, 2021

IMO AD instructions should also be accompanied either by predicates or asserts which stress the conditions for the no-op implementation.

This comment has been minimized.

@XiaohongGong

XiaohongGong Apr 15, 2021
Author Contributor

OK, thanks! I will add the type restriction in the predicate.

ins_cost(0);
format %{ "vmaskcast $dst\t# empty (sve)" %}
ins_encode %{
// empty
%}
ins_pipe(pipe_class_empty);
%}

@@ -4199,7 +4199,8 @@ bool MatchRule::is_vector() const {
"FmaVD", "FmaVF","PopCountVI",
// Next are not supported currently.
"PackB","PackS","PackI","PackL","PackF","PackD","Pack2L","Pack2D",
"ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD"
"ExtractB","ExtractUB","ExtractC","ExtractS","ExtractI","ExtractL","ExtractF","ExtractD",
"VectorMaskCast"
};
int cnt = sizeof(vector_list)/sizeof(char*);
if (_rChild) {
@@ -446,6 +446,7 @@ macro(VectorBoxAllocate)
macro(VectorUnbox)
macro(VectorMaskWrapper)
macro(VectorMaskCmp)
macro(VectorMaskCast)
macro(VectorTest)
macro(VectorBlend)
macro(VectorRearrange)
@@ -803,11 +803,6 @@ class TypeVect : public Type {
virtual const Type *xmeet( const Type *t) const;
virtual const Type *xdual() const; // Compute dual right now.

bool mask_compatible(const TypeVect* t) const {
return base() == t->base() && length() == t->length() &&
type2aelembytes(element_basic_type()) == type2aelembytes(t->element_basic_type());
}

static const TypeVect *VECTA;
static const TypeVect *VECTS;
static const TypeVect *VECTD;
@@ -972,21 +972,10 @@ ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, Basi
}

Node* VectorLoadMaskNode::Identity(PhaseGVN* phase) {
const TypeVect* out_type = type()->is_vect();
BasicType out_bt = out_type->element_basic_type();
BasicType out_bt = type()->is_vect()->element_basic_type();
if (out_bt == T_BOOLEAN) {
return in(1); // redundant conversion
}

// "VectorLoadMask (VectorStoreMask vmask) ==> vmask" if:
// "vmask" has the same vector length and the same element size
// (i.e. T_FLOAT and T_INT have the same size) with current node.
if (in(1)->Opcode() == Op_VectorStoreMask) {
Node* mask = in(1)->in(1);
if (out_type->mask_compatible(mask->bottom_type()->is_vect())) {
return mask;
}
}
return this;
}

@@ -1243,6 +1232,11 @@ Node* VectorUnboxNode::Ideal(PhaseGVN* phase, bool can_reshape) {
bool is_vector_mask = vbox_klass->is_subclass_of(ciEnv::current()->vector_VectorMask_klass());
bool is_vector_shuffle = vbox_klass->is_subclass_of(ciEnv::current()->vector_VectorShuffle_klass());
if (is_vector_mask) {
if (in_vt->length_in_bytes() == out_vt->length_in_bytes() &&
Matcher::match_rule_supported(Op_VectorMaskCast)) {
This conversation was marked as resolved by XiaohongGong

This comment has been minimized.

@iwanowww

iwanowww Apr 9, 2021

There's Matcher::match_rule_supported_vector() specifically for vector nodes.

This comment has been minimized.

@XiaohongGong

XiaohongGong Apr 12, 2021
Author Contributor

OK, I will use this instead. Thanks!

// VectorUnbox (VectorBox vmask) ==> VectorMaskCast (vmask)

This comment has been minimized.

@iwanowww

iwanowww Apr 9, 2021

It's better to implement it as a 2-step transformation and place it in VectorLoadMaskNode::Ideal():

VectorUnbox (VectorBox vmask) ==> VectorLoadMask (VectorStoreMask vmask) => VectorMaskCast (vmask)

This comment has been minimized.

@XiaohongGong

XiaohongGong Apr 12, 2021
Author Contributor

Thanks for your comments. Yes, theoretically it's better to place it in VectorLoadMaskNode::Ideal(). Unfortunately, we met an issue that is related to optimization for VectorStoreMask. Considering the following case:

   LoadVector                                              LoadVector
           |                                                    |
   VectorLoadMask  (double)                          VectorLoadMask (double)
               |                                               |
           VectorUnbox   (long)          ==>           VectorStoreMask  (double)
                                                               |
                                                      VectorLoadMask  (long)

This case loads the masking values for a double type, and does a bitwise and operation. Since the type is mismatched, the compiler will try to do VectorUnbox (VectorBox vmask) ==> VectorLoadMask (VectorStoreMask vmask). However, since there is the transformation VectorStoreMask (VectorLoadMask value) ==> value, the above VectorStoreMaskNode will be optimized out. The final graph looks like:

   LoadVector                                                  LoadVector
          |                                                      /      \
        VectorLoadMask  (double)                               /         \
              |                      ==>     VectorLoadMask (double)      \
         VectorStoreMask (double)                                   VectorLoadMask (long)
                   |
            VectorLoadMask (long)

Since the two VectorLoadMaskNode have different element type, the GVN cannot optimize out one. So finally there will be two similar VectorLoadMaskNodes. That's also why I override the cmp/hash for VectorLoadMaskNode in the first version.

So I prefer to add VectorUnbox (VectorBox vmask) ==> VectorMaskCast (vmask) directly.

This comment has been minimized.

@iwanowww

iwanowww Apr 12, 2021

Ok, so you face a transformation ordering problem here.

By working on VectorUnbox (VectorBox vmask) you effectively delay VectorStoreMask (VectorLoadMask vmask) => vmask transformation.

As an alternative you could:

(1) check for VectorLoadMask users before applying VectorStoreMask (VectorLoadMask vmask) => vmask;

(2) nest adjacent casts:

VectorLoadMask #double (1 LoadVector)
VectorLoadMask #long   (1 LoadVector)
==>
VectorMaskCast #long (VectorLoadMask #double (1 LoadVector)

The latter looks more powerful (and hence preferable), but I'm fine with what you have right now. (It can be enhanced later.)
Please, leave a comment describing the motivation for doing the transformation directly on VectorUnbox (VectorBox ...).

This comment has been minimized.

@XiaohongGong

XiaohongGong Apr 13, 2021
Author Contributor

Thanks for your alternative advice! I prefer to keep the code as it is right now. Also the comments have been added. Thanks!

return new VectorMaskCastNode(value, out_vt);
}
// VectorUnbox (VectorBox vmask) ==> VectorLoadMask (VectorStoreMask vmask)
value = phase->transform(VectorStoreMaskNode::make(*phase, value, in_vt->element_basic_type(), in_vt->length()));
return new VectorLoadMaskNode(value, out_vt);
@@ -1224,17 +1224,6 @@ class VectorLoadMaskNode : public VectorNode {

virtual int Opcode() const;
virtual Node* Identity(PhaseGVN* phase);
virtual bool cmp( const Node &n ) const {
// The vector types for mask nodes can be treated equal if they are compatible.
return vect_type()->mask_compatible(((VectorLoadMaskNode&)n).vect_type());
}

virtual uint hash() const {
// Compute the hash value based on the vector length and element size
// for mask nodes. Note that the same element size can be verified with
// the same length and same length in bytes.
return Node::hash() + (intptr_t)length_in_bytes() + (intptr_t)length();
}
};

class VectorStoreMaskNode : public VectorNode {
@@ -1248,6 +1237,17 @@ class VectorStoreMaskNode : public VectorNode {
static VectorStoreMaskNode* make(PhaseGVN& gvn, Node* in, BasicType in_type, uint num_elem);
};

class VectorMaskCastNode : public VectorNode {

This comment has been minimized.

@jatin-bhateja

jatin-bhateja Apr 13, 2021
Member

VectorMaskReinterpret seems a better choice, since it's a re-interpretation and not a casting (up/down).

This comment has been minimized.

@iwanowww

iwanowww Apr 13, 2021

Considering masks have platform-specific representation, full-blown casts between different element types look more appropriate here.

In this particular case, the focus is on the cheapest possible case when representations share the same bit pattern and the cast degenerates into a no-op. But in the longer term, it makes perfect sense to support the full matrix of conversions and don't rely on VectorLoadMask <=> VectorStoreMask and intermediate canonical vector representation.

This comment has been minimized.

public:
// Construct a mask cast from 'in' to vector type 'vt'. The cast is only
// legal between mask types with the same number of lanes and the same
// per-element byte size (e.g. T_FLOAT <-> T_INT), which is exactly the case
// where the backend can implement it as a no-op; both conditions are
// enforced here so no matcher predicate needs to re-check them.
VectorMaskCastNode(Node* in, const TypeVect* vt) : VectorNode(in, vt) {
const TypeVect* in_vt = in->bottom_type()->is_vect();
// Same lane count: a mask covers one bit-pattern per lane.
assert(in_vt->length() == vt->length(), "vector length must match");
// Same element width: lanes of equal size share their mask representation.
assert(type2aelembytes(in_vt->element_basic_type()) == type2aelembytes(vt->element_basic_type()), "element size must match");
}

This conversation was marked as resolved by XiaohongGong

This comment has been minimized.

@iwanowww

iwanowww Apr 9, 2021

FTR there's a way to avoid platform-specific changes (in AD files) if we don't envision any non-trivial conversions to be supported: just disable matching of the node by overriding Node::ideal_reg():

virtual uint ideal_reg() const { return 0; } // not matched in the AD file 

This comment has been minimized.

@XiaohongGong

XiaohongGong Apr 12, 2021
Author Contributor

Good idea, I will try this way. Thanks!

This comment has been minimized.

@XiaohongGong

XiaohongGong Apr 12, 2021
Author Contributor

I met the bad AD file issue when I remove the changes in AD files and overriding Node::ideal_reg(). I guess if a node is not used as an input of other node, this can work well? For the VectorMaskCastNode, it must be an input for some other nodes. If just disable the matching of this node, how does its usage find the right input code?

This comment has been minimized.

@iwanowww

iwanowww Apr 12, 2021

Yeah, ideal_reg() { return 0; } won't work here. Sorry for the misleading suggestion.

This comment has been minimized.

@XiaohongGong

XiaohongGong Apr 13, 2021
Author Contributor

That's ok. Thanks!

virtual int Opcode() const;
};

// This is intended for use as a simple reinterpret node that has no cast.
class VectorReinterpretNode : public VectorNode {
private:
@@ -1877,6 +1877,7 @@ typedef HashtableEntry<InstanceKlass*, mtClass> KlassHashtableEntry;
declare_c2_type(VectorInsertNode, VectorNode) \
declare_c2_type(VectorUnboxNode, VectorNode) \
declare_c2_type(VectorReinterpretNode, VectorNode) \
declare_c2_type(VectorMaskCastNode, VectorNode) \
declare_c2_type(VectorBoxNode, Node) \
declare_c2_type(VectorBoxAllocateNode, CallStaticJavaNode) \
declare_c2_type(VectorTestNode, Node) \
ProTip! Use n and p to navigate between commits in a pull request.