From 7f9641e382b9c10fd21eba78182cb39690732146 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 16 Mar 2023 08:29:08 +0100 Subject: [PATCH 01/19] 8302652: [SuperWord] Reduction should happen after loop, when possible --- src/hotspot/share/opto/vectorIntrinsics.cpp | 2 +- src/hotspot/share/opto/vectornode.cpp | 91 ++++++++++++++++++- src/hotspot/share/opto/vectornode.hpp | 46 ++++++++-- .../loopopts/superword/ReductionPerf.java | 25 +++-- 4 files changed, 143 insertions(+), 21 deletions(-) diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index ae0315c61c767..cc23c7d0c8cab 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -1702,7 +1702,7 @@ bool LibraryCallKit::inline_vector_reduction() { } } - Node* init = ReductionNode::make_reduction_input(gvn(), opc, elem_bt); + Node* init = ReductionNode::make_reduction_input_from_scalar_opc(gvn(), opc, elem_bt); Node* value = nullptr; if (mask == nullptr) { assert(!is_masked_op, "Masked op needs the mask value never null"); diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 49a5bee32071b..5a7f1dfce8064 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1299,6 +1299,89 @@ Node* ReductionNode::Ideal(PhaseGVN* phase, bool can_reshape) { return nullptr; } +Node* UnorderedReductionNode::Ideal(PhaseGVN* phase, bool can_reshape) { + Node* n = ReductionNode::Ideal(phase, can_reshape); + if (n != nullptr) { + return n; + } + if (can_reshape) { + // Having a ReductionNode in the loop is expensive. It needs to recursively + // fold together the vector values, for every vectorized loop iteration. If + // we encounter the following pattern, we can move the UnorderedReduction + // outside the loop. 
+ // + // CountedLoop init + // | | + // +------+ | +---------------+ + // | | | | + // PhiNode (s) Vector | + // | | | + // UnorderedReduction | + // | | + // +-------------+ + // + // We patch the graph to look like this: + // + // CountedLoop neutral_vector + // | | + // +-------+ | +---------------+ + // | | | | + // PhiNode (v) Vector | + // | | | + // init VectorAccumulator | + // | | | | + // UnorderedReduction +-----------+ + // + // We turned the scalar (s) Phi into a vectorized one (v). In the loop, we + // use a vector_accumulator, which does the same reduction, but only element + // wise. This is a single operation, rather than many for the ReductionNode. + // We can then reduce that vector_accumulator after the loop, and also reduce + // the init value into it. + // We can not do this with all reductions. Some reductions do not allow the + // reordering of operations (for example float addition). + Node* ctrl = in(0); + Node* in1 = in(1); + Node* in2 = in(2); + if (ctrl == nullptr && + in1 != nullptr && in1->is_Phi() && in1->in(2) == this && in1->outcnt() == 1 && + in1->in(0)->is_CountedLoop() && + in2->is_Vector()) { + // Find the relevant old nodes + VectorNode* vector = in2->as_Vector(); + PhiNode* phi = in1->as_Phi(); + CountedLoopNode* loop = phi->in(0)->as_CountedLoop(); + Node* init = phi->in(1); + // Determine types + BasicType bt = vect_type()->element_basic_type(); + const Type* bt_t = Type::get_const_basic_type(bt); + // Create vector of neutral elements (zero for add, one for mul, etc) + Node* neutral_scalar = ReductionNode::make_reduction_input_from_vector_opc(*phase, Opcode(), bt); + neutral_scalar = phase->transform(neutral_scalar); + Node* neutral_vector = VectorNode::scalar2vector(neutral_scalar, vector->length(), bt_t); + const TypeVect* vec_t = neutral_vector->as_Vector()->vect_type(); + neutral_vector = phase->transform(neutral_vector); + // Build vector Phi + Node* phi_vector = new PhiNode(loop, vec_t); + 
phase->set_type(phi_vector, vec_t); + // Start loop with neutral element + phi_vector->set_req(1, neutral_vector); + // In each iteration, do vector accumulation + Node* vector_accumulator = make_normal_vector_op(phi_vector, vector, vec_t); + phase->set_type(vector_accumulator, vec_t); + vector_accumulator = phase->transform(vector_accumulator); + // And feed that into the vector Phi for the next iteration + phi_vector->set_req(2, vector_accumulator); + phi_vector = phase->transform(phi_vector); + // After the loop, we can reduce the init and vector_accumulator + set_req_X(1, init, phase); + set_req_X(2, vector_accumulator, phase); + assert(phi->outcnt() == 0, "scalar phi is unused"); + return this; + } + } + return nullptr; +} + Node* VectorLoadMaskNode::Identity(PhaseGVN* phase) { BasicType out_bt = type()->is_vect()->element_basic_type(); if (!Matcher::has_predicated_vectors() && out_bt == T_BOOLEAN) { @@ -1397,10 +1480,16 @@ Node* VectorCastNode::Identity(PhaseGVN* phase) { return this; } -Node* ReductionNode::make_reduction_input(PhaseGVN& gvn, int opc, BasicType bt) { +// Input opc of pre-reduction operation, eg AddI for AddReductionVI +Node* ReductionNode::make_reduction_input_from_scalar_opc(PhaseGVN& gvn, int opc, BasicType bt) { int vopc = opcode(opc, bt); guarantee(vopc != opc, "Vector reduction for '%s' is not implemented", NodeClassNames[opc]); + return make_reduction_input_from_scalar_opc(gvn, vopc, bt); +} + +// Input opc of vector reduction, eg. 
AddReductionVI +Node* ReductionNode::make_reduction_input_from_vector_opc(PhaseGVN& gvn, int vopc, BasicType bt) { switch (vopc) { case Op_AndReductionV: switch (bt) { diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index da6d500f4d4d5..71716dfaf2625 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -25,6 +25,8 @@ #define SHARE_OPTO_VECTORNODE_HPP #include "opto/callnode.hpp" +#include "opto/cfgnode.hpp" +#include "opto/loopnode.hpp" #include "opto/matcher.hpp" #include "opto/memnode.hpp" #include "opto/node.hpp" @@ -194,7 +196,10 @@ class ReductionNode : public Node { static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt); static int opcode(int opc, BasicType bt); static bool implemented(int opc, uint vlen, BasicType bt); - static Node* make_reduction_input(PhaseGVN& gvn, int opc, BasicType bt); + // Input opc of pre-reduction operation, eg. AddI for AddReductionVI + static Node* make_reduction_input_from_scalar_opc(PhaseGVN& gvn, int opc, BasicType bt); + // Input vopc of vector reduction, eg. AddReductionVI + static Node* make_reduction_input_from_vector_opc(PhaseGVN& gvn, int vopc, BasicType bt); virtual const Type* bottom_type() const { return _bottom_type; @@ -214,20 +219,37 @@ class ReductionNode : public Node { virtual uint size_of() const { return sizeof(*this); } }; +//---------------------------UnorderedReductionNode------------------------------------- +// Order of reduction does not matter. Example int add. Not true for float add. +class UnorderedReductionNode : public ReductionNode { +public: + UnorderedReductionNode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + // Reduction loops can move this more expensive node outside the loop. 
+ virtual Node* Ideal(PhaseGVN* phase, bool can_reshape); + + virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) = 0; +}; + //------------------------------AddReductionVINode-------------------------------------- // Vector add byte, short and int as a reduction -class AddReductionVINode : public ReductionNode { +class AddReductionVINode : public UnorderedReductionNode { public: - AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; + virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { + return new AddVINode(in1, in2, vt); + } }; //------------------------------AddReductionVLNode-------------------------------------- // Vector add long as a reduction -class AddReductionVLNode : public ReductionNode { +class AddReductionVLNode : public UnorderedReductionNode { public: - AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; + virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { + return new AddVLNode(in1, in2, vt); + } }; //------------------------------AddReductionVFNode-------------------------------------- @@ -384,18 +406,24 @@ class CMoveVDNode : public VectorNode { //------------------------------MulReductionVINode-------------------------------------- // Vector multiply byte, short and int as a reduction -class MulReductionVINode : public ReductionNode { +class MulReductionVINode : public UnorderedReductionNode { public: - MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; + virtual 
VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { + return new MulVINode(in1, in2, vt); + } }; //------------------------------MulReductionVLNode-------------------------------------- // Vector multiply int as a reduction -class MulReductionVLNode : public ReductionNode { +class MulReductionVLNode : public UnorderedReductionNode { public: - MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; + virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { + return new MulVLNode(in1, in2, vt); + } }; //------------------------------MulReductionVFNode-------------------------------------- diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java b/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java index d96d5e29c0070..4e6018e275385 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java @@ -43,6 +43,12 @@ public class ReductionPerf { public static void main(String[] args) throws Exception { + + // Increase the values to 10_000 and 100_000 for more serious benchmarking + // with lower variance. 
+ int iterations_warmup = 2_000; + int iterations_perf = 5_000; + int[] a1 = new int[8 * 1024]; int[] a2 = new int[8 * 1024]; int[] a3 = new int[8 * 1024]; @@ -79,7 +85,7 @@ public static void main(String[] args) throws Exception { System.out.println("Warmup ..."); long start = System.currentTimeMillis(); - for (int j = 0; j < 2000; j++) { + for (int j = 0; j < iterations_warmup; j++) { sumI = sumInt(a1, a2, a3); sumL = sumLong(b1, b2, b3); sumF = sumFloat(c1, c2, c3); @@ -119,61 +125,60 @@ public static void main(String[] args) throws Exception { } start = System.currentTimeMillis(); - for (int j = 0; j < 5000; j++) { + for (int j = 0; j < iterations_perf; j++) { sumI = sumInt(a1, a2, a3); } stop = System.currentTimeMillis(); System.out.println("sum int: " + (stop - start)); start = System.currentTimeMillis(); - for (int j = 0; j < 5000; j++) { + for (int j = 0; j < iterations_perf; j++) { sumL = sumLong(b1, b2, b3); } stop = System.currentTimeMillis(); System.out.println("sum long: " + (stop - start)); start = System.currentTimeMillis(); - for (int j = 0; j < 5000; j++) { + for (int j = 0; j < iterations_perf; j++) { sumF = sumFloat(c1, c2, c3); } stop = System.currentTimeMillis(); System.out.println("sum float: " + (stop - start)); start = System.currentTimeMillis(); - for (int j = 0; j < 5000; j++) { + for (int j = 0; j < iterations_perf; j++) { sumD = sumDouble(d1, d2, d3); } stop = System.currentTimeMillis(); System.out.println("sum double: " + (stop - start)); start = System.currentTimeMillis(); - for (int j = 0; j < 5000; j++) { + for (int j = 0; j < iterations_perf; j++) { mulI = prodInt(a1, a2, a3); } stop = System.currentTimeMillis(); System.out.println("prod int: " + (stop - start)); start = System.currentTimeMillis(); - for (int j = 0; j < 5000; j++) { + for (int j = 0; j < iterations_perf; j++) { mulL = prodLong(b1, b2, b3); } stop = System.currentTimeMillis(); System.out.println("prod long: " + (stop - start)); start = System.currentTimeMillis(); - 
for (int j = 0; j < 5000; j++) { + for (int j = 0; j < iterations_perf; j++) { mulF = prodFloat(c1, c2, c3); } stop = System.currentTimeMillis(); System.out.println("prod float: " + (stop - start)); start = System.currentTimeMillis(); - for (int j = 0; j < 5000; j++) { + for (int j = 0; j < iterations_perf; j++) { mulD = prodDouble(d1, d2, d3); } stop = System.currentTimeMillis(); System.out.println("prod double: " + (stop - start)); - } public static void ReductionInit(int[] a1, int[] a2, int[] a3, From 7b551b6f0618262206deed0b600b93f04c594f18 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 16 Mar 2023 12:34:46 +0100 Subject: [PATCH 02/19] fix typo --- src/hotspot/share/opto/vectornode.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 5a7f1dfce8064..e46de72a9a7f3 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1485,7 +1485,7 @@ Node* ReductionNode::make_reduction_input_from_scalar_opc(PhaseGVN& gvn, int opc int vopc = opcode(opc, bt); guarantee(vopc != opc, "Vector reduction for '%s' is not implemented", NodeClassNames[opc]); - return make_reduction_input_from_scalar_opc(gvn, vopc, bt); + return make_reduction_input_from_vector_opc(gvn, vopc, bt); } // Input opc of vector reduction, eg. 
AddReductionVI From 83988de1a4565c75ea9d34d61325b84de006b13d Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 21 Mar 2023 15:37:38 +0100 Subject: [PATCH 03/19] fix for and, or, xor, min, max --- src/hotspot/share/opto/vectornode.hpp | 47 ++++++++++++++++++--------- 1 file changed, 31 insertions(+), 16 deletions(-) diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 71716dfaf2625..da5faeb0bcf69 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -763,10 +763,13 @@ class AndVNode : public VectorNode { //------------------------------AndReductionVNode-------------------------------------- // Vector and byte, short, int, long as a reduction -class AndReductionVNode : public ReductionNode { +class AndReductionVNode : public UnorderedReductionNode { public: - AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; + virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { + return new AndVNode(in1, in2, vt); + } }; //------------------------------OrVNode--------------------------------------- @@ -780,18 +783,13 @@ class OrVNode : public VectorNode { //------------------------------OrReductionVNode-------------------------------------- // Vector xor byte, short, int, long as a reduction -class OrReductionVNode : public ReductionNode { +class OrReductionVNode : public UnorderedReductionNode { public: - OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} - virtual int Opcode() const; -}; - -//------------------------------XorReductionVNode-------------------------------------- -// Vector and int, long as a reduction -class XorReductionVNode : public ReductionNode { - public: - XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} 
+ OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; + virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { + return new OrVNode(in1, in2, vt); + } }; //------------------------------XorVNode--------------------------------------- @@ -803,20 +801,37 @@ class XorVNode : public VectorNode { virtual Node* Ideal(PhaseGVN* phase, bool can_reshape); }; +//------------------------------XorReductionVNode-------------------------------------- +// Vector and int, long as a reduction +class XorReductionVNode : public UnorderedReductionNode { + public: + XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} + virtual int Opcode() const; + virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { + return new XorVNode(in1, in2, vt); + } +}; + //------------------------------MinReductionVNode-------------------------------------- // Vector min byte, short, int, long, float, double as a reduction -class MinReductionVNode : public ReductionNode { +class MinReductionVNode : public UnorderedReductionNode { public: - MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; + virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { + return new MinVNode(in1, in2, vt); + } }; //------------------------------MaxReductionVNode-------------------------------------- // Vector min byte, short, int, long, float, double as a reduction -class MaxReductionVNode : public ReductionNode { +class MaxReductionVNode : public UnorderedReductionNode { public: - MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} + MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} 
virtual int Opcode() const; + virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { + return new MaxVNode(in1, in2, vt); + } }; //------------------------------CompressVNode-------------------------------------- From e108ae5bf944c88148757c71c3828bf92d9f9b4a Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 21 Mar 2023 18:47:14 +0100 Subject: [PATCH 04/19] added IR rules to validate reduced use of Reduce node --- .../jtreg/compiler/loopopts/superword/ProdRed_Int.java | 2 +- .../jtreg/compiler/loopopts/superword/RedTest_int.java | 10 +++++----- .../compiler/loopopts/superword/RedTest_long.java | 10 +++++----- .../jtreg/compiler/loopopts/superword/SumRed_Int.java | 2 +- .../jtreg/compiler/loopopts/superword/SumRed_Long.java | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java b/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java index ab7f83c18a3ac..17f3a97a8e84d 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/ProdRed_Int.java @@ -84,7 +84,7 @@ public static void prodReductionInit(int[] a, int[] b) { failOn = {IRNode.MUL_REDUCTION_VI}) @IR(applyIfCPUFeature = {"sse4.1", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.MUL_REDUCTION_VI, ">= 1"}) + counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop public static int prodReductionImplement(int[] a, int[] b, int total) { for (int i = 0; i < a.length; i++) { total *= a[i] + b[i]; diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java index 9beea472adcd5..faece0cbf9f3f 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java +++ 
b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_int.java @@ -134,7 +134,7 @@ public static void reductionInit2( failOn = {IRNode.ADD_REDUCTION_VI}) @IR(applyIfCPUFeature = {"sse4.1", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.ADD_REDUCTION_VI, ">= 1"}) + counts = {IRNode.ADD_REDUCTION_VI, ">= 1", IRNode.ADD_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop public static int sumReductionImplement( int[] a, int[] b, @@ -151,7 +151,7 @@ public static int sumReductionImplement( failOn = {IRNode.OR_REDUCTION_V}) @IR(applyIfCPUFeature = {"sse4.1", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.OR_REDUCTION_V, ">= 1"}) + counts = {IRNode.OR_REDUCTION_V, ">= 1", IRNode.OR_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop public static int orReductionImplement( int[] a, int[] b, @@ -168,7 +168,7 @@ public static int orReductionImplement( failOn = {IRNode.AND_REDUCTION_V}) @IR(applyIfCPUFeature = {"sse4.1", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.AND_REDUCTION_V, ">= 1"}) + counts = {IRNode.AND_REDUCTION_V, ">= 1", IRNode.AND_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop public static int andReductionImplement( int[] a, int[] b, @@ -185,7 +185,7 @@ public static int andReductionImplement( failOn = {IRNode.XOR_REDUCTION_V}) @IR(applyIfCPUFeature = {"sse4.1", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.XOR_REDUCTION_V, ">= 1"}) + counts = {IRNode.XOR_REDUCTION_V, ">= 1", IRNode.XOR_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop public static int xorReductionImplement( int[] a, int[] b, @@ -202,7 +202,7 @@ public static int xorReductionImplement( failOn = {IRNode.MUL_REDUCTION_VI}) @IR(applyIfCPUFeature = {"sse4.1", "true"}, applyIfAnd = {"SuperWordReductions", "true", 
"LoopMaxUnroll", ">= 8"}, - counts = {IRNode.MUL_REDUCTION_VI, ">= 1"}) + counts = {IRNode.MUL_REDUCTION_VI, ">= 1", IRNode.MUL_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop public static int mulReductionImplement( int[] a, int[] b, diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java index 59d612d015977..27bfa8cec0ebb 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/RedTest_long.java @@ -137,7 +137,7 @@ public static void reductionInit2( failOn = {IRNode.ADD_REDUCTION_VL}) @IR(applyIfCPUFeature = {"avx2", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.ADD_REDUCTION_VL, ">= 1"}) + counts = {IRNode.ADD_REDUCTION_VL, ">= 1", IRNode.ADD_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop public static long sumReductionImplement( long[] a, long[] b, @@ -154,7 +154,7 @@ public static long sumReductionImplement( failOn = {IRNode.OR_REDUCTION_V}) @IR(applyIfCPUFeature = {"avx2", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.OR_REDUCTION_V, ">= 1"}) + counts = {IRNode.OR_REDUCTION_V, ">= 1", IRNode.OR_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop public static long orReductionImplement( long[] a, long[] b, @@ -171,7 +171,7 @@ public static long orReductionImplement( failOn = {IRNode.AND_REDUCTION_V}) @IR(applyIfCPUFeature = {"avx2", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.AND_REDUCTION_V, ">= 1"}) + counts = {IRNode.AND_REDUCTION_V, ">= 1", IRNode.AND_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop public static long andReductionImplement( long[] a, long[] b, @@ -188,7 +188,7 @@ public static long andReductionImplement( failOn = {IRNode.XOR_REDUCTION_V}) 
@IR(applyIfCPUFeature = {"avx2", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.XOR_REDUCTION_V, ">= 1"}) + counts = {IRNode.XOR_REDUCTION_V, ">= 1", IRNode.XOR_REDUCTION_V, "<= 2"}) // one for main-loop, one for vector-post-loop public static long xorReductionImplement( long[] a, long[] b, @@ -205,7 +205,7 @@ public static long xorReductionImplement( failOn = {IRNode.MUL_REDUCTION_VL}) @IR(applyIfCPUFeature = {"avx512dq", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.MUL_REDUCTION_VL, ">= 1"}) + counts = {IRNode.MUL_REDUCTION_VL, ">= 1", IRNode.MUL_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop public static long mulReductionImplement( long[] a, long[] b, diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/SumRed_Int.java b/test/hotspot/jtreg/compiler/loopopts/superword/SumRed_Int.java index 77eebd0eea30c..ad6d9e45051f2 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/SumRed_Int.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/SumRed_Int.java @@ -91,7 +91,7 @@ public static void sumReductionInit( failOn = {IRNode.ADD_REDUCTION_VI}) @IR(applyIfCPUFeature = {"sse4.1", "true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.ADD_REDUCTION_VI, ">= 1"}) + counts = {IRNode.ADD_REDUCTION_VI, ">= 1", IRNode.ADD_REDUCTION_VI, "<= 2"}) // one for main-loop, one for vector-post-loop public static int sumReductionImplement( int[] a, int[] b, diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/SumRed_Long.java b/test/hotspot/jtreg/compiler/loopopts/superword/SumRed_Long.java index 278c81f707cb2..ea41a652940de 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/SumRed_Long.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/SumRed_Long.java @@ -95,7 +95,7 @@ public static void sumReductionInit( failOn = {IRNode.ADD_REDUCTION_VL}) @IR(applyIfCPUFeature = {"avx2", 
"true"}, applyIfAnd = {"SuperWordReductions", "true", "LoopMaxUnroll", ">= 8"}, - counts = {IRNode.ADD_REDUCTION_VL, ">= 1"}) + counts = {IRNode.ADD_REDUCTION_VL, ">= 1", IRNode.ADD_REDUCTION_VL, "<= 2"}) // one for main-loop, one for vector-post-loop public static long sumReductionImplement( long[] a, long[] b, From 9391f99d8dcd4a018b6a9b1b94e1a07387bc1073 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 21 Mar 2023 19:07:06 +0100 Subject: [PATCH 05/19] pushed updated ReductionPerf.java --- .../loopopts/superword/ReductionPerf.java | 659 +++++++++++++----- 1 file changed, 490 insertions(+), 169 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java b/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java index 4e6018e275385..19121639fbd22 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java @@ -1,5 +1,5 @@ /* - * Copyright (c) 2015, 2022, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, 2023, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -21,247 +21,568 @@ * questions. 
*/ -/** +/* * @test - * @bug 8074981 - * @summary Add C2 x86 Superword support for scalar product reduction optimizations : int test - * @requires os.arch=="x86" | os.arch=="i386" | os.arch=="amd64" | os.arch=="x86_64" | os.arch=="aarch64" | os.arch=="riscv64" - * - * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions - * -XX:LoopUnrollLimit=250 -XX:CompileThresholdScaling=0.1 - * -XX:CompileCommand=exclude,compiler.loopopts.superword.ReductionPerf::main - * -XX:+SuperWordReductions - * compiler.loopopts.superword.ReductionPerf - * @run main/othervm -XX:+IgnoreUnrecognizedVMOptions - * -XX:LoopUnrollLimit=250 -XX:CompileThresholdScaling=0.1 - * -XX:CompileCommand=exclude,compiler.loopopts.superword.ReductionPerf::main - * -XX:-SuperWordReductions - * compiler.loopopts.superword.ReductionPerf + * @bug 8074981 8302652 + * @summary Test SuperWord Reduction Perf. + * @requires vm.compiler2.enabled + * @library /test/lib / + * @run main/othervm -XX:LoopUnrollLimit=250 + * -XX:CompileCommand=exclude,compiler.loopopts.superword.ReductionPerf::main + * compiler.loopopts.superword.ReductionPerf */ package compiler.loopopts.superword; +import java.util.Random; +import jdk.test.lib.Utils; public class ReductionPerf { - public static void main(String[] args) throws Exception { - - // Increase the values to 10_000 and 100_000 for more serious benchmarking - // with lower variance. 
- int iterations_warmup = 2_000; - int iterations_perf = 5_000; - - int[] a1 = new int[8 * 1024]; - int[] a2 = new int[8 * 1024]; - int[] a3 = new int[8 * 1024]; - long[] b1 = new long[8 * 1024]; - long[] b2 = new long[8 * 1024]; - long[] b3 = new long[8 * 1024]; - float[] c1 = new float[8 * 1024]; - float[] c2 = new float[8 * 1024]; - float[] c3 = new float[8 * 1024]; - double[] d1 = new double[8 * 1024]; - double[] d2 = new double[8 * 1024]; - double[] d3 = new double[8 * 1024]; - - ReductionInit(a1, a2, a3, b1, b2, b3, c1, c2, c3, d1, d2, d3); - - int sumIv = sumInt(a1, a2, a3); - long sumLv = sumLong(b1, b2, b3); - float sumFv = sumFloat(c1, c2, c3); - double sumDv = sumDouble(d1, d2, d3); - int mulIv = prodInt(a1, a2, a3); - long mulLv = prodLong(b1, b2, b3); - float mulFv = prodFloat(c1, c2, c3); - double mulDv = prodDouble(d1, d2, d3); - - int sumI = 0; - long sumL = 0; - float sumF = 0.f; - double sumD = 0.; - int mulI = 0; - long mulL = 0; - float mulF = 0.f; - double mulD = 0.; - - System.out.println("Warmup ..."); - long start = System.currentTimeMillis(); - - for (int j = 0; j < iterations_warmup; j++) { - sumI = sumInt(a1, a2, a3); - sumL = sumLong(b1, b2, b3); - sumF = sumFloat(c1, c2, c3); - sumD = sumDouble(d1, d2, d3); - mulI = prodInt(a1, a2, a3); - mulL = prodLong(b1, b2, b3); - mulF = prodFloat(c1, c2, c3); - mulD = prodDouble(d1, d2, d3); - } - - long stop = System.currentTimeMillis(); - System.out.println(" Warmup is done in " + (stop - start) + " msec"); - - if (sumIv != sumI) { - System.out.println("sum int: " + sumIv + " != " + sumI); - } - if (sumLv != sumL) { - System.out.println("sum long: " + sumLv + " != " + sumL); - } - if (sumFv != sumF) { - System.out.println("sum float: " + sumFv + " != " + sumF); - } - if (sumDv != sumD) { - System.out.println("sum double: " + sumDv + " != " + sumD); - } - if (mulIv != mulI) { - System.out.println("prod int: " + mulIv + " != " + mulI); - } - if (mulLv != mulL) { - System.out.println("prod long: " 
+ mulLv + " != " + mulL); - } - if (mulFv != mulF) { - System.out.println("prod float: " + mulFv + " != " + mulF); - } - if (mulDv != mulD) { - System.out.println("prod double: " + mulDv + " != " + mulD); + static final int RANGE = 8192; + static Random rand = Utils.getRandomInstance(); + + public static void main(String args[]) { + int iter_warmup = 2_000; + int iter_perf = 5_000; + + float[] aFloat = new float[RANGE]; + float[] bFloat = new float[RANGE]; + float[] cFloat = new float[RANGE]; + double[] aDouble = new double[RANGE]; + double[] bDouble = new double[RANGE]; + double[] cDouble = new double[RANGE]; + long[] aLong = new long[RANGE]; + long[] bLong = new long[RANGE]; + long[] cLong = new long[RANGE]; + int[] aInt = new int[RANGE]; + int[] bInt = new int[RANGE]; + int[] cInt = new int[RANGE]; + + long start, stop; + + int startIntAdd = init(aInt, bInt, cInt); + int goldIntAdd = testIntAdd(aInt, bInt, cInt, startIntAdd); + for (int j = 0; j < iter_warmup; j++) { + int total = testIntAdd(aInt, bInt, cInt, startIntAdd); + verify("int add", total, goldIntAdd); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testIntAdd(aInt, bInt, cInt, startIntAdd); + } + stop = System.currentTimeMillis(); + System.out.println("int add " + (stop - start)); + + int startIntMul = init(aInt, bInt, cInt); + int goldIntMul = testIntMul(aInt, bInt, cInt, startIntMul); + for (int j = 0; j < iter_warmup; j++) { + int total = testIntMul(aInt, bInt, cInt, startIntMul); + verify("int mul", total, goldIntMul); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testIntMul(aInt, bInt, cInt, startIntMul); + } + stop = System.currentTimeMillis(); + System.out.println("int mul " + (stop - start)); + + int startIntMin = init(aInt, bInt, cInt); + int goldIntMin = testIntMin(aInt, bInt, cInt, startIntMin); + for (int j = 0; j < iter_warmup; j++) { + int total = testIntMin(aInt, bInt, cInt, startIntMin); + verify("int min", 
total, goldIntMin); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testIntMin(aInt, bInt, cInt, startIntMin); + } + stop = System.currentTimeMillis(); + System.out.println("int min " + (stop - start)); + + int startIntMax = init(aInt, bInt, cInt); + int goldIntMax = testIntMax(aInt, bInt, cInt, startIntMax); + for (int j = 0; j < iter_warmup; j++) { + int total = testIntMax(aInt, bInt, cInt, startIntMax); + verify("int max", total, goldIntMax); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testIntMax(aInt, bInt, cInt, startIntMax); + } + stop = System.currentTimeMillis(); + System.out.println("int max " + (stop - start)); + + int startIntAnd = init(aInt, bInt, cInt); + int goldIntAnd = testIntAnd(aInt, bInt, cInt, startIntAnd); + for (int j = 0; j < iter_warmup; j++) { + int total = testIntAnd(aInt, bInt, cInt, startIntAnd); + verify("int and", total, goldIntAnd); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testIntAnd(aInt, bInt, cInt, startIntAnd); + } + stop = System.currentTimeMillis(); + System.out.println("int and " + (stop - start)); + + int startIntOr = init(aInt, bInt, cInt); + int goldIntOr = testIntOr(aInt, bInt, cInt, startIntOr); + for (int j = 0; j < iter_warmup; j++) { + int total = testIntOr(aInt, bInt, cInt, startIntOr); + verify("int or", total, goldIntOr); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testIntOr(aInt, bInt, cInt, startIntOr); + } + stop = System.currentTimeMillis(); + System.out.println("int or " + (stop - start)); + + int startIntXor = init(aInt, bInt, cInt); + int goldIntXor = testIntXor(aInt, bInt, cInt, startIntXor); + for (int j = 0; j < iter_warmup; j++) { + int total = testIntXor(aInt, bInt, cInt, startIntXor); + verify("int xor", total, goldIntXor); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testIntXor(aInt, bInt, cInt, startIntXor); 
+ } + stop = System.currentTimeMillis(); + System.out.println("int xor " + (stop - start)); + + long startLongAdd = init(aLong, bLong, cLong); + long goldLongAdd = testLongAdd(aLong, bLong, cLong, startLongAdd); + for (int j = 0; j < iter_warmup; j++) { + long total = testLongAdd(aLong, bLong, cLong, startLongAdd); + verify("long add", total, goldLongAdd); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testLongAdd(aLong, bLong, cLong, startLongAdd); + } + stop = System.currentTimeMillis(); + System.out.println("long add " + (stop - start)); + + long startLongMul = init(aLong, bLong, cLong); + long goldLongMul = testLongMul(aLong, bLong, cLong, startLongMul); + for (int j = 0; j < iter_warmup; j++) { + long total = testLongMul(aLong, bLong, cLong, startLongMul); + verify("long mul", total, goldLongMul); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testLongMul(aLong, bLong, cLong, startLongMul); + } + stop = System.currentTimeMillis(); + System.out.println("long mul " + (stop - start)); + + long startLongMin = init(aLong, bLong, cLong); + long goldLongMin = testLongMin(aLong, bLong, cLong, startLongMin); + for (int j = 0; j < iter_warmup; j++) { + long total = testLongMin(aLong, bLong, cLong, startLongMin); + verify("long min", total, goldLongMin); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testLongMin(aLong, bLong, cLong, startLongMin); + } + stop = System.currentTimeMillis(); + System.out.println("long min " + (stop - start)); + + long startLongMax = init(aLong, bLong, cLong); + long goldLongMax = testLongMax(aLong, bLong, cLong, startLongMax); + for (int j = 0; j < iter_warmup; j++) { + long total = testLongMax(aLong, bLong, cLong, startLongMax); + verify("long max", total, goldLongMax); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testLongMax(aLong, bLong, cLong, startLongMax); + } + stop = 
System.currentTimeMillis(); + System.out.println("long max " + (stop - start)); + + long startLongAnd = init(aLong, bLong, cLong); + long goldLongAnd = testLongAnd(aLong, bLong, cLong, startLongAnd); + for (int j = 0; j < iter_warmup; j++) { + long total = testLongAnd(aLong, bLong, cLong, startLongAnd); + verify("long and", total, goldLongAnd); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testLongAnd(aLong, bLong, cLong, startLongAnd); + } + stop = System.currentTimeMillis(); + System.out.println("long and " + (stop - start)); + + long startLongOr = init(aLong, bLong, cLong); + long goldLongOr = testLongOr(aLong, bLong, cLong, startLongOr); + for (int j = 0; j < iter_warmup; j++) { + long total = testLongOr(aLong, bLong, cLong, startLongOr); + verify("long or", total, goldLongOr); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testLongOr(aLong, bLong, cLong, startLongOr); + } + stop = System.currentTimeMillis(); + System.out.println("long or " + (stop - start)); + + long startLongXor = init(aLong, bLong, cLong); + long goldLongXor = testLongXor(aLong, bLong, cLong, startLongXor); + for (int j = 0; j < iter_warmup; j++) { + long total = testLongXor(aLong, bLong, cLong, startLongXor); + verify("long xor", total, goldLongXor); + } + start = System.currentTimeMillis(); + for (int j = 0; j < iter_perf; j++) { + testLongXor(aLong, bLong, cLong, startLongXor); } + stop = System.currentTimeMillis(); + System.out.println("long xor " + (stop - start)); + float startFloatAdd = init(aFloat, bFloat, cFloat); + float goldFloatAdd = testFloatAdd(aFloat, bFloat, cFloat, startFloatAdd); + for (int j = 0; j < iter_warmup; j++) { + float total = testFloatAdd(aFloat, bFloat, cFloat, startFloatAdd); + verify("float add", total, goldFloatAdd); + } start = System.currentTimeMillis(); - for (int j = 0; j < iterations_perf; j++) { - sumI = sumInt(a1, a2, a3); + for (int j = 0; j < iter_perf; j++) { + 
testFloatAdd(aFloat, bFloat, cFloat, startFloatAdd); } stop = System.currentTimeMillis(); - System.out.println("sum int: " + (stop - start)); + System.out.println("float add " + (stop - start)); + float startFloatMul = init(aFloat, bFloat, cFloat); + float goldFloatMul = testFloatMul(aFloat, bFloat, cFloat, startFloatMul); + for (int j = 0; j < iter_warmup; j++) { + float total = testFloatMul(aFloat, bFloat, cFloat, startFloatMul); + verify("float mul", total, goldFloatMul); + } start = System.currentTimeMillis(); - for (int j = 0; j < iterations_perf; j++) { - sumL = sumLong(b1, b2, b3); + for (int j = 0; j < iter_perf; j++) { + testFloatMul(aFloat, bFloat, cFloat, startFloatMul); } stop = System.currentTimeMillis(); - System.out.println("sum long: " + (stop - start)); + System.out.println("float mul " + (stop - start)); + float startFloatMin = init(aFloat, bFloat, cFloat); + float goldFloatMin = testFloatMin(aFloat, bFloat, cFloat, startFloatMin); + for (int j = 0; j < iter_warmup; j++) { + float total = testFloatMin(aFloat, bFloat, cFloat, startFloatMin); + verify("float min", total, goldFloatMin); + } start = System.currentTimeMillis(); - for (int j = 0; j < iterations_perf; j++) { - sumF = sumFloat(c1, c2, c3); + for (int j = 0; j < iter_perf; j++) { + testFloatMin(aFloat, bFloat, cFloat, startFloatMin); } stop = System.currentTimeMillis(); - System.out.println("sum float: " + (stop - start)); + System.out.println("float min " + (stop - start)); + float startFloatMax = init(aFloat, bFloat, cFloat); + float goldFloatMax = testFloatMax(aFloat, bFloat, cFloat, startFloatMax); + for (int j = 0; j < iter_warmup; j++) { + float total = testFloatMax(aFloat, bFloat, cFloat, startFloatMax); + verify("float max", total, goldFloatMax); + } start = System.currentTimeMillis(); - for (int j = 0; j < iterations_perf; j++) { - sumD = sumDouble(d1, d2, d3); + for (int j = 0; j < iter_perf; j++) { + testFloatMax(aFloat, bFloat, cFloat, startFloatMax); } stop = 
System.currentTimeMillis(); - System.out.println("sum double: " + (stop - start)); + System.out.println("float max " + (stop - start)); + double startDoubleAdd = init(aDouble, bDouble, cDouble); + double goldDoubleAdd = testDoubleAdd(aDouble, bDouble, cDouble, startDoubleAdd); + for (int j = 0; j < iter_warmup; j++) { + double total = testDoubleAdd(aDouble, bDouble, cDouble, startDoubleAdd); + verify("double add", total, goldDoubleAdd); + } start = System.currentTimeMillis(); - for (int j = 0; j < iterations_perf; j++) { - mulI = prodInt(a1, a2, a3); + for (int j = 0; j < iter_perf; j++) { + testDoubleAdd(aDouble, bDouble, cDouble, startDoubleAdd); } stop = System.currentTimeMillis(); - System.out.println("prod int: " + (stop - start)); + System.out.println("double add " + (stop - start)); + double startDoubleMul = init(aDouble, bDouble, cDouble); + double goldDoubleMul = testDoubleMul(aDouble, bDouble, cDouble, startDoubleMul); + for (int j = 0; j < iter_warmup; j++) { + double total = testDoubleMul(aDouble, bDouble, cDouble, startDoubleMul); + verify("double mul", total, goldDoubleMul); + } start = System.currentTimeMillis(); - for (int j = 0; j < iterations_perf; j++) { - mulL = prodLong(b1, b2, b3); + for (int j = 0; j < iter_perf; j++) { + testDoubleMul(aDouble, bDouble, cDouble, startDoubleMul); } stop = System.currentTimeMillis(); - System.out.println("prod long: " + (stop - start)); + System.out.println("double mul " + (stop - start)); + double startDoubleMin = init(aDouble, bDouble, cDouble); + double goldDoubleMin = testDoubleMin(aDouble, bDouble, cDouble, startDoubleMin); + for (int j = 0; j < iter_warmup; j++) { + double total = testDoubleMin(aDouble, bDouble, cDouble, startDoubleMin); + verify("double min", total, goldDoubleMin); + } start = System.currentTimeMillis(); - for (int j = 0; j < iterations_perf; j++) { - mulF = prodFloat(c1, c2, c3); + for (int j = 0; j < iter_perf; j++) { + testDoubleMin(aDouble, bDouble, cDouble, startDoubleMin); } stop = 
System.currentTimeMillis(); - System.out.println("prod float: " + (stop - start)); + System.out.println("double min " + (stop - start)); + double startDoubleMax = init(aDouble, bDouble, cDouble); + double goldDoubleMax = testDoubleMax(aDouble, bDouble, cDouble, startDoubleMax); + for (int j = 0; j < iter_warmup; j++) { + double total = testDoubleMax(aDouble, bDouble, cDouble, startDoubleMax); + verify("double max", total, goldDoubleMax); + } start = System.currentTimeMillis(); - for (int j = 0; j < iterations_perf; j++) { - mulD = prodDouble(d1, d2, d3); + for (int j = 0; j < iter_perf; j++) { + testDoubleMax(aDouble, bDouble, cDouble, startDoubleMax); } stop = System.currentTimeMillis(); - System.out.println("prod double: " + (stop - start)); + System.out.println("double max " + (stop - start)); + + } + + // ------------------- Tests ------------------- + + static int testIntAdd(int[] a, int[] b, int[] c, int total) { + for (int i = 0; i < RANGE; i++) { + int v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total += v; + } + return total; + } + + static int testIntMul(int[] a, int[] b, int[] c, int total) { + for (int i = 0; i < RANGE; i++) { + int v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total *= v; + } + return total; + } + + static int testIntMin(int[] a, int[] b, int[] c, int total) { + for (int i = 0; i < RANGE; i++) { + int v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total = Math.min(total, v); + } + return total; + } + + static int testIntMax(int[] a, int[] b, int[] c, int total) { + for (int i = 0; i < RANGE; i++) { + int v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total = Math.max(total, v); + } + return total; + } + + static int testIntAnd(int[] a, int[] b, int[] c, int total) { + for (int i = 0; i < RANGE; i++) { + int v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total &= v; + } + return total; + } + + static int testIntOr(int[] a, int[] b, int[] c, int total) { + for (int i = 0; i < RANGE; i++) { + int v = 
(a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total |= v; + } + return total; + } + + static int testIntXor(int[] a, int[] b, int[] c, int total) { + for (int i = 0; i < RANGE; i++) { + int v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total ^= v; + } + return total; + } + + static long testLongAdd(long[] a, long[] b, long[] c, long total) { + for (int i = 0; i < RANGE; i++) { + long v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total += v; + } + return total; + } + + static long testLongMul(long[] a, long[] b, long[] c, long total) { + for (int i = 0; i < RANGE; i++) { + long v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total *= v; + } + return total; } - public static void ReductionInit(int[] a1, int[] a2, int[] a3, - long[] b1, long[] b2, long[] b3, - float[] c1, float[] c2, float[] c3, - double[] d1, double[] d2, double[] d3) { - for(int i = 0; i < a1.length; i++) { - a1[i] = (i + 0); - a2[i] = (i + 1); - a3[i] = (i + 2); - b1[i] = (long) (i + 0); - b2[i] = (long) (i + 1); - b3[i] = (long) (i + 2); - c1[i] = (float) (i + 0); - c2[i] = (float) (i + 1); - c3[i] = (float) (i + 2); - d1[i] = (double) (i + 0); - d2[i] = (double) (i + 1); - d3[i] = (double) (i + 2); + static long testLongMin(long[] a, long[] b, long[] c, long total) { + for (int i = 0; i < RANGE; i++) { + long v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total = Math.min(total, v); } + return total; + } + + static long testLongMax(long[] a, long[] b, long[] c, long total) { + for (int i = 0; i < RANGE; i++) { + long v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total = Math.max(total, v); + } + return total; + } + + static long testLongAnd(long[] a, long[] b, long[] c, long total) { + for (int i = 0; i < RANGE; i++) { + long v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total &= v; + } + return total; + } + + static long testLongOr(long[] a, long[] b, long[] c, long total) { + for (int i = 0; i < RANGE; i++) { + long v = (a[i] * b[i]) + (a[i] * c[i]) + 
(b[i] * c[i]); + total |= v; + } + return total; + } + + static long testLongXor(long[] a, long[] b, long[] c, long total) { + for (int i = 0; i < RANGE; i++) { + long v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total ^= v; + } + return total; } - public static int sumInt(int[] a1, int[] a2, int[] a3) { - int total = 0; - for (int i = 0; i < a1.length; i++) { - total += (a1[i] * a2[i]) + (a1[i] * a3[i]) + (a2[i] * a3[i]); + static float testFloatAdd(float[] a, float[] b, float[] c, float total) { + for (int i = 0; i < RANGE; i++) { + float v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total += v; } return total; } - public static long sumLong(long[] b1, long[] b2, long[] b3) { - long total = 0; - for (int i = 0; i < b1.length; i++) { - total += (b1[i] * b2[i]) + (b1[i] * b3[i]) + (b2[i] * b3[i]); + static float testFloatMul(float[] a, float[] b, float[] c, float total) { + for (int i = 0; i < RANGE; i++) { + float v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total *= v; } return total; } - public static float sumFloat(float[] c1, float[] c2, float[] c3) { - float total = 0; - for (int i = 0; i < c1.length; i++) { - total += (c1[i] * c2[i]) + (c1[i] * c3[i]) + (c2[i] * c3[i]); + static float testFloatMin(float[] a, float[] b, float[] c, float total) { + for (int i = 0; i < RANGE; i++) { + float v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total = Math.min(total, v); } return total; } - public static double sumDouble(double[] d1, double[] d2, double[] d3) { - double total = 0; - for (int i = 0; i < d1.length; i++) { - total += (d1[i] * d2[i]) + (d1[i] * d3[i]) + (d2[i] * d3[i]); + static float testFloatMax(float[] a, float[] b, float[] c, float total) { + for (int i = 0; i < RANGE; i++) { + float v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total = Math.max(total, v); } return total; } - public static int prodInt(int[] a1, int[] a2, int[] a3) { - int total = 1; - for (int i = 0; i < a1.length; i++) { - total *= (a1[i] * a2[i]) 
+ (a1[i] * a3[i]) + (a2[i] * a3[i]); + static double testDoubleAdd(double[] a, double[] b, double[] c, double total) { + for (int i = 0; i < RANGE; i++) { + double v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total += v; } return total; } - public static long prodLong(long[] b1, long[] b2, long[] b3) { - long total = 1; - for (int i = 0; i < b1.length; i++) { - total *= (b1[i] * b2[i]) + (b1[i] * b3[i]) + (b2[i] * b3[i]); + static double testDoubleMul(double[] a, double[] b, double[] c, double total) { + for (int i = 0; i < RANGE; i++) { + double v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total *= v; } return total; } - public static float prodFloat(float[] c1, float[] c2, float[] c3) { - float total = 1; - for (int i = 0; i < c1.length; i++) { - total *= (c1[i] * c2[i]) + (c1[i] * c3[i]) + (c2[i] * c3[i]); + static double testDoubleMin(double[] a, double[] b, double[] c, double total) { + for (int i = 0; i < RANGE; i++) { + double v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total = Math.min(total, v); } return total; } - public static double prodDouble(double[] d1, double[] d2, double[] d3) { - double total = 1; - for (int i = 0; i < d1.length; i++) { - total *= (d1[i] * d2[i]) + (d1[i] * d3[i]) + (d2[i] * d3[i]); + static double testDoubleMax(double[] a, double[] b, double[] c, double total) { + for (int i = 0; i < RANGE; i++) { + double v = (a[i] * b[i]) + (a[i] * c[i]) + (b[i] * c[i]); + total = Math.max(total, v); } return total; } + + // ------------------- Initialization ------------------- + + static int init(int[] a, int[] b, int[] c) { + for (int j = 0; j < RANGE; j++) { + a[j] = rand.nextInt(); + b[j] = rand.nextInt(); + c[j] = rand.nextInt(); + } + return rand.nextInt(); + } + + static long init(long[] a, long[] b, long[] c) { + for (int j = 0; j < RANGE; j++) { + a[j] = rand.nextLong(); + b[j] = rand.nextLong(); + c[j] = rand.nextLong(); + } + return rand.nextLong(); + } + + static float init(float[] a, float[] b, float[] c) 
{ + for (int j = 0; j < RANGE; j++) { + a[j] = rand.nextFloat(); + b[j] = rand.nextFloat(); + c[j] = rand.nextFloat(); + } + return rand.nextFloat(); + } + + static double init(double[] a, double[] b, double[] c) { + for (int j = 0; j < RANGE; j++) { + a[j] = rand.nextDouble(); + b[j] = rand.nextDouble(); + c[j] = rand.nextDouble(); + } + return rand.nextDouble(); + } + + // ------------------- Verification ------------------- + + static void verify(String context, float total, float gold) { + if (total != gold) { + throw new RuntimeException("Wrong result for " + context + ": " + total + " != " + gold); + } + } + static void verify(String context, double total, double gold) { + if (total != gold) { + throw new RuntimeException("Wrong result for " + context + ": " + total + " != " + gold); + } + } + static void verify(String context, long total, long gold) { + if (total != gold) { + throw new RuntimeException("Wrong result for " + context + ": " + total + " != " + gold); + } + } + static void verify(String context, int total, int gold) { + if (total != gold) { + throw new RuntimeException("Wrong result for " + context + ": " + total + " != " + gold); + } + } } From 3a9ba971b8de78b05031b1328b4021a3c4abf210 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 22 Mar 2023 09:14:32 +0100 Subject: [PATCH 06/19] Vladimir's suggestions for ReductionPerf.java --- .../loopopts/superword/ReductionPerf.java | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java b/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java index 19121639fbd22..b1495d00548f8 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/ReductionPerf.java @@ -26,8 +26,9 @@ * @bug 8074981 8302652 * @summary Test SuperWord Reduction Perf. 
* @requires vm.compiler2.enabled + * @requires vm.simpleArch == "x86" | vm.simpleArch == "x64" | vm.simpleArch == "aarch64" | vm.simpleArch == "riscv64" * @library /test/lib / - * @run main/othervm -XX:LoopUnrollLimit=250 + * @run main/othervm -Xbatch -XX:LoopUnrollLimit=250 * -XX:CompileCommand=exclude,compiler.loopopts.superword.ReductionPerf::main * compiler.loopopts.superword.ReductionPerf */ @@ -41,21 +42,22 @@ public class ReductionPerf { static Random rand = Utils.getRandomInstance(); public static void main(String args[]) { - int iter_warmup = 2_000; - int iter_perf = 5_000; + // Please increase iterations for measurement to 2_000 and 100_000. + int iter_warmup = 100; + int iter_perf = 1_000; - float[] aFloat = new float[RANGE]; - float[] bFloat = new float[RANGE]; - float[] cFloat = new float[RANGE]; double[] aDouble = new double[RANGE]; double[] bDouble = new double[RANGE]; double[] cDouble = new double[RANGE]; - long[] aLong = new long[RANGE]; - long[] bLong = new long[RANGE]; - long[] cLong = new long[RANGE]; + float[] aFloat = new float[RANGE]; + float[] bFloat = new float[RANGE]; + float[] cFloat = new float[RANGE]; int[] aInt = new int[RANGE]; int[] bInt = new int[RANGE]; int[] cInt = new int[RANGE]; + long[] aLong = new long[RANGE]; + long[] bLong = new long[RANGE]; + long[] cLong = new long[RANGE]; long start, stop; @@ -565,22 +567,22 @@ static double init(double[] a, double[] b, double[] c) { // ------------------- Verification ------------------- - static void verify(String context, float total, float gold) { + static void verify(String context, double total, double gold) { if (total != gold) { throw new RuntimeException("Wrong result for " + context + ": " + total + " != " + gold); } } - static void verify(String context, double total, double gold) { + static void verify(String context, float total, float gold) { if (total != gold) { throw new RuntimeException("Wrong result for " + context + ": " + total + " != " + gold); } } - static void 
verify(String context, long total, long gold) { + static void verify(String context, int total, int gold) { if (total != gold) { throw new RuntimeException("Wrong result for " + context + ": " + total + " != " + gold); } } - static void verify(String context, int total, int gold) { + static void verify(String context, long total, long gold) { if (total != gold) { throw new RuntimeException("Wrong result for " + context + ": " + total + " != " + gold); } From ed9e788e4c4a91f05f0b4487bbed4fdaae69f1fc Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 27 Mar 2023 11:16:41 +0200 Subject: [PATCH 07/19] Moved code from Ideal to SuperWord --- src/hotspot/share/opto/node.hpp | 6 ++ src/hotspot/share/opto/superword.cpp | 135 ++++++++++++++++++++++++++ src/hotspot/share/opto/superword.hpp | 2 + src/hotspot/share/opto/vectornode.cpp | 83 ---------------- src/hotspot/share/opto/vectornode.hpp | 10 +- 5 files changed, 149 insertions(+), 87 deletions(-) diff --git a/src/hotspot/share/opto/node.hpp b/src/hotspot/share/opto/node.hpp index 107654752bfa7..4812b3d709ab5 100644 --- a/src/hotspot/share/opto/node.hpp +++ b/src/hotspot/share/opto/node.hpp @@ -151,6 +151,7 @@ class Pipeline; class PopulateIndexNode; class ProjNode; class RangeCheckNode; +class ReductionNode; class RegMask; class RegionNode; class RootNode; @@ -164,6 +165,7 @@ class SubTypeCheckNode; class Type; class TypeNode; class UnlockNode; +class UnorderedReductionNode; class VectorNode; class LoadVectorNode; class LoadVectorMaskedNode; @@ -712,6 +714,8 @@ class Node { DEFINE_CLASS_ID(CompressV, Vector, 4) DEFINE_CLASS_ID(ExpandV, Vector, 5) DEFINE_CLASS_ID(CompressM, Vector, 6) + DEFINE_CLASS_ID(Reduction, Vector, 7) + DEFINE_CLASS_ID(UnorderedReduction, Reduction, 0) DEFINE_CLASS_ID(Con, Type, 8) DEFINE_CLASS_ID(ConI, Con, 0) @@ -935,6 +939,7 @@ class Node { DEFINE_CLASS_QUERY(PCTable) DEFINE_CLASS_QUERY(Phi) DEFINE_CLASS_QUERY(Proj) + DEFINE_CLASS_QUERY(Reduction) DEFINE_CLASS_QUERY(Region) 
DEFINE_CLASS_QUERY(Root) DEFINE_CLASS_QUERY(SafePoint) @@ -944,6 +949,7 @@ class Node { DEFINE_CLASS_QUERY(Sub) DEFINE_CLASS_QUERY(SubTypeCheck) DEFINE_CLASS_QUERY(Type) + DEFINE_CLASS_QUERY(UnorderedReduction) DEFINE_CLASS_QUERY(Vector) DEFINE_CLASS_QUERY(VectorMaskCmp) DEFINE_CLASS_QUERY(VectorUnbox) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index baf880aac2016..9236504964914 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -36,6 +36,7 @@ #include "opto/mulnode.hpp" #include "opto/opcodes.hpp" #include "opto/opaquenode.hpp" +#include "opto/rootnode.hpp" #include "opto/superword.hpp" #include "opto/vectornode.hpp" #include "opto/movenode.hpp" @@ -2537,6 +2538,8 @@ bool SuperWord::output() { } } + Node_List unordered_reductions; + for (int i = 0; i < _block.length(); i++) { Node* n = _block.at(i); Node_List* p = my_pack(n); @@ -2664,6 +2667,9 @@ bool SuperWord::output() { if (node_isa_reduction) { const Type *arith_type = n->bottom_type(); vn = ReductionNode::make(opc, nullptr, in1, in2, arith_type->basic_type()); + if (vn->is_UnorderedReduction()) { + unordered_reductions.push(vn); + } if (in2->is_Load()) { vlen_in_bytes = in2->as_LoadVector()->memory_size(); } else { @@ -2877,10 +2883,139 @@ bool SuperWord::output() { if (do_reserve_copy()) { make_reversable.use_new(); } + + for (uint i = 0; i < unordered_reductions.size(); i++) { + UnorderedReductionNode* ur = unordered_reductions.at(i)->as_UnorderedReduction(); + move_unordered_reduction_out_of_loop(ur); + } + NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);}) return true; } +// Having a ReductionNode in the loop is expensive. It needs to recursively +// fold together the vector values, for every vectorized loop iteration. If +// we encounter the following pattern, we can move the UnorderedReduction +// outside the loop. 
+// +// CountedLoop init +// | | +// +------+ | +---------------+ +// | | | | +// PhiNode (s) Vector | +// | | | +// UnorderedReduction | +// | | +// +-------------+ +// +// We patch the graph to look like this: +// +// CountedLoop neutral_vector +// | | +// +-------+ | +---------------+ +// | | | | +// PhiNode (v) Vector | +// | | | +// init VectorAccumulator | +// | | | | +// UnorderedReduction +-----------+ +// +// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we +// use a vector_accumulator, which does the same reduction, but only element +// wise. This is a single operation, rather than many for the ReductionNode. +// We can then reduce that vector_accumulator after the loop, and also reduce +// the init value into it. +// We can not do this with all reductions. Some reductions do not allow the +// reordering of operations (for example float addition). +void SuperWord::move_unordered_reduction_out_of_loop(UnorderedReductionNode* ur) { + Compile* C = _phase->C; + CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); + Node* ctrl = ur->in(0); + Node* in1 = ur->in(1); + Node* in2 = ur->in(2); + + // Expect no ctrl for ur + if (ctrl != nullptr) { + return; + } + + // Expect data loop over backedge of Phi in cl + if (in1 == nullptr || !in1->is_Phi() || in1->in(2) != ur || in1->outcnt() != 1 || + in1->in(0) != cl) { + return; + } + + // Expect all uses to be outside the loop, except Phi + PhiNode* phi = in1->as_Phi(); + for (DUIterator_Fast kmax, k = ur->fast_outs(kmax); k < kmax; k++) { + Node* use = ur->fast_out(k); + if (use != phi && _phase->ctrl_or_self(use) == cl) { + return; + } + } + + // Expect input vector inside loop + if (!in2->is_Vector() || _phase->get_ctrl(in2) != cl) { + return; + } + VectorNode* vector = in2->as_Vector(); + + // Determine types + BasicType bt = ur->vect_type()->element_basic_type(); + const Type* bt_t = Type::get_const_basic_type(bt); + + // Create vector of neutral elements (zero for add, one for mul, etc) + 
Node* neutral_scalar = ReductionNode::make_reduction_input_from_vector_opc(_igvn, ur->Opcode(), bt); + _phase->set_ctrl(neutral_scalar, C->root()); + VectorNode* neutral_vector = VectorNode::scalar2vector(neutral_scalar, vector->length(), bt_t); + _igvn.register_new_node_with_optimizer(neutral_vector); + _phase->set_ctrl(neutral_vector, C->root()); + const TypeVect* vec_t = neutral_vector->vect_type(); + + // Build vector Phi + Node* vector_phi = new PhiNode(cl, vec_t); + _igvn.register_new_node_with_optimizer(vector_phi); + C->copy_node_notes_to(vector_phi, phi); + _phase->set_ctrl(vector_phi, cl); + + // Start loop with neutral element + vector_phi->set_req(1, neutral_vector); + + // In each iteration, do vector accumulation + VectorNode* vector_accumulator = ur->make_normal_vector_op(vector_phi, vector, vec_t); + _igvn.register_new_node_with_optimizer(vector_accumulator); + C->copy_node_notes_to(vector_accumulator, ur); + _phase->set_ctrl(vector_accumulator, cl); + + // And feed that into the vector Phi for the next iteration + vector_phi->set_req(2, vector_accumulator); + + // After the loop, we can reduce the init and vector_accumulator + Node* init = phi->in(1); + ur->set_req_X(1, init, &_igvn); + ur->set_req_X(2, vector_accumulator, &_igvn); + + // Cut output to old Phi, so that we only have outputs outside the loop + phi->set_req_X(2, C->top(), &_igvn); + + // Update control to outside the loop + Node* new_ctrl = _phase->get_late_ctrl(ur, cl); + _phase->set_ctrl(ur, new_ctrl); + assert(new_ctrl != nullptr && new_ctrl != cl, "new control of ur must be outside loop"); + assert(phi->outcnt() == 0, "scalar phi is unused"); + +#ifdef ASSERT + if (TraceNewVectors) { + tty->print("new Vector node: "); + neutral_vector->dump(); + tty->print("new Vector node: "); + vector_phi->dump(); + tty->print("new Vector node: "); + vector_accumulator->dump(); + } +#endif +} + //-------------------------create_post_loop_vmask------------------------- // Check the post loop 
vectorizability and create a vector mask if yes. // Return null to bail out if post loop is not vectorizable. diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 6d24e528be5ee..766674a540b11 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -557,6 +557,8 @@ class SuperWord : public ResourceObj { // Convert packs into vector node operations bool output(); + // Move UnorderedReduction out of loop if possible + void move_unordered_reduction_out_of_loop(UnorderedReductionNode* ur); // Create vector mask for post loop vectorization Node* create_post_loop_vmask(); // Create a vector operand for the nodes in pack p for operand: in(opd_idx) diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index e46de72a9a7f3..0ea1b2f8373f7 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1299,89 +1299,6 @@ Node* ReductionNode::Ideal(PhaseGVN* phase, bool can_reshape) { return nullptr; } -Node* UnorderedReductionNode::Ideal(PhaseGVN* phase, bool can_reshape) { - Node* n = ReductionNode::Ideal(phase, can_reshape); - if (n != nullptr) { - return n; - } - if (can_reshape) { - // Having a ReductionNode in the loop is expensive. It needs to recursively - // fold together the vector values, for every vectorized loop iteration. If - // we encounter the following pattern, we can move the UnorderedReduction - // outside the loop. - // - // CountedLoop init - // | | - // +------+ | +---------------+ - // | | | | - // PhiNode (s) Vector | - // | | | - // UnorderedReduction | - // | | - // +-------------+ - // - // We patch the graph to look like this: - // - // CountedLoop neutral_vector - // | | - // +-------+ | +---------------+ - // | | | | - // PhiNode (v) Vector | - // | | | - // init VectorAccumulator | - // | | | | - // UnorderedReduction +-----------+ - // - // We turned the scalar (s) Phi into a vectorized one (v). 
In the loop, we - // use a vector_accumulator, which does the same reduction, but only element - // wise. This is a single operation, rather than many for the ReductionNode. - // We can then reduce that vector_accumulator after the loop, and also reduce - // the init value into it. - // We can not do this with all reductions. Some reductions do not allow the - // reordering of operations (for example float addition). - Node* ctrl = in(0); - Node* in1 = in(1); - Node* in2 = in(2); - if (ctrl == nullptr && - in1 != nullptr && in1->is_Phi() && in1->in(2) == this && in1->outcnt() == 1 && - in1->in(0)->is_CountedLoop() && - in2->is_Vector()) { - // Find the relevant old nodes - VectorNode* vector = in2->as_Vector(); - PhiNode* phi = in1->as_Phi(); - CountedLoopNode* loop = phi->in(0)->as_CountedLoop(); - Node* init = phi->in(1); - // Determine types - BasicType bt = vect_type()->element_basic_type(); - const Type* bt_t = Type::get_const_basic_type(bt); - // Create vector of neutral elements (zero for add, one for mul, etc) - Node* neutral_scalar = ReductionNode::make_reduction_input_from_vector_opc(*phase, Opcode(), bt); - neutral_scalar = phase->transform(neutral_scalar); - Node* neutral_vector = VectorNode::scalar2vector(neutral_scalar, vector->length(), bt_t); - const TypeVect* vec_t = neutral_vector->as_Vector()->vect_type(); - neutral_vector = phase->transform(neutral_vector); - // Build vector Phi - Node* phi_vector = new PhiNode(loop, vec_t); - phase->set_type(phi_vector, vec_t); - // Start loop with neutral element - phi_vector->set_req(1, neutral_vector); - // In each iteration, do vector accumulation - Node* vector_accumulator = make_normal_vector_op(phi_vector, vector, vec_t); - phase->set_type(vector_accumulator, vec_t); - vector_accumulator = phase->transform(vector_accumulator); - // And feed that into the vector Phi for the next iteration - phi_vector->set_req(2, vector_accumulator); - phi_vector = phase->transform(phi_vector); - // After the loop, we can 
reduce the init and vector_accumulator - set_req_X(1, init, phase); - set_req_X(2, vector_accumulator, phase); - assert(phi->outcnt() == 0, "scalar phi is unused"); - return this; - } - } - return nullptr; -} - Node* VectorLoadMaskNode::Identity(PhaseGVN* phase) { BasicType out_bt = type()->is_vect()->element_basic_type(); if (!Matcher::has_predicated_vectors() && out_bt == T_BOOLEAN) { diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index da5faeb0bcf69..004f133f37cb9 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -191,7 +191,9 @@ class ReductionNode : public Node { public: ReductionNode(Node *ctrl, Node* in1, Node* in2) : Node(ctrl, in1, in2), _bottom_type(Type::get_const_basic_type(in1->bottom_type()->basic_type())), - _vect_type(in2->bottom_type()->is_vect()) {} + _vect_type(in2->bottom_type()->is_vect()) { + init_class_id(Class_Reduction); + } static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt); static int opcode(int opc, BasicType bt); @@ -223,9 +225,9 @@ class ReductionNode : public Node { // Order of reduction does not matter. Example int add. Not true for float add. class UnorderedReductionNode : public ReductionNode { public: - UnorderedReductionNode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) {} - // Reduction loops can move this more expensive node outside the loop. 
- virtual Node* Ideal(PhaseGVN* phase, bool can_reshape); + UnorderedReductionNode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) { + init_class_id(Class_UnorderedReduction); + } virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) = 0; }; From 9b01aea1717ec3bed4168741fd42a0e6d5aa976c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 5 Apr 2023 13:45:49 +0200 Subject: [PATCH 08/19] neutral -> identity element --- src/hotspot/share/opto/superword.cpp | 22 ++++++++++----------- src/hotspot/share/opto/vectorIntrinsics.cpp | 2 +- src/hotspot/share/opto/vectornode.cpp | 6 +++--- src/hotspot/share/opto/vectornode.hpp | 7 +++---- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 9236504964914..4652d66798bd1 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2910,7 +2910,7 @@ bool SuperWord::output() { // // We patch the graph to look like this: // -// CountedLoop neutral_vector +// CountedLoop identity_vector // | | // +-------+ | +---------------+ // | | | | @@ -2964,13 +2964,13 @@ void SuperWord::move_unordered_reduction_out_of_loop(UnorderedReductionNode* ur) BasicType bt = ur->vect_type()->element_basic_type(); const Type* bt_t = Type::get_const_basic_type(bt); - // Create vector of neutral elements (zero for add, one for mul, etc) - Node* neutral_scalar = ReductionNode::make_reduction_input_from_vector_opc(_igvn, ur->Opcode(), bt); - _phase->set_ctrl(neutral_scalar, C->root()); - VectorNode* neutral_vector = VectorNode::scalar2vector(neutral_scalar, vector->length(), bt_t); - _igvn.register_new_node_with_optimizer(neutral_vector); - _phase->set_ctrl(neutral_vector, C->root()); - const TypeVect* vec_t = neutral_vector->vect_type(); + // Create vector of identity elements (zero for add, one for mul, etc) + Node* identity_scalar = 
ReductionNode::make_identity_input_for_reduction_from_vector_opc(_igvn, ur->Opcode(), bt); + _phase->set_ctrl(identity_scalar, C->root()); + VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector->length(), bt_t); + _igvn.register_new_node_with_optimizer(identity_vector); + _phase->set_ctrl(identity_vector, C->root()); + const TypeVect* vec_t = identity_vector->vect_type(); // Build vector Phi Node* vector_phi = new PhiNode(cl, vec_t); @@ -2978,8 +2978,8 @@ void SuperWord::move_unordered_reduction_out_of_loop(UnorderedReductionNode* ur) C->copy_node_notes_to(vector_phi, phi); _phase->set_ctrl(vector_phi, cl); - // Start loop with neutral element - vector_phi->set_req(1, neutral_vector); + // Start loop with identity element + vector_phi->set_req(1, identity_vector); // In each iteration, do vector accumulation VectorNode* vector_accumulator = ur->make_normal_vector_op(vector_phi, vector, vec_t); @@ -3007,7 +3007,7 @@ void SuperWord::move_unordered_reduction_out_of_loop(UnorderedReductionNode* ur) #ifdef ASSERT if (TraceNewVectors) { tty->print("new Vector node: "); - neutral_vector->dump(); + identity_vector->dump(); tty->print("new Vector node: "); vector_phi->dump(); tty->print("new Vector node: "); diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index cc23c7d0c8cab..c16dbfa5470cb 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -1702,7 +1702,7 @@ bool LibraryCallKit::inline_vector_reduction() { } } - Node* init = ReductionNode::make_reduction_input_from_scalar_opc(gvn(), opc, elem_bt); + Node* init = ReductionNode::make_identity_input_for_reduction_from_scalar_opc(gvn(), opc, elem_bt); Node* value = nullptr; if (mask == nullptr) { assert(!is_masked_op, "Masked op needs the mask value never null"); diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 0ea1b2f8373f7..f384182dc5211 
100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1398,15 +1398,15 @@ Node* VectorCastNode::Identity(PhaseGVN* phase) { } // Input opc of pre-reduction operation, eg AddI for AddReductionVI -Node* ReductionNode::make_reduction_input_from_scalar_opc(PhaseGVN& gvn, int opc, BasicType bt) { +Node* ReductionNode::make_identity_input_for_reduction_from_scalar_opc(PhaseGVN& gvn, int opc, BasicType bt) { int vopc = opcode(opc, bt); guarantee(vopc != opc, "Vector reduction for '%s' is not implemented", NodeClassNames[opc]); - return make_reduction_input_from_vector_opc(gvn, vopc, bt); + return make_identity_input_for_reduction_from_vector_opc(gvn, vopc, bt); } // Input opc of vector reduction, eg. AddReductionVI -Node* ReductionNode::make_reduction_input_from_vector_opc(PhaseGVN& gvn, int vopc, BasicType bt) { +Node* ReductionNode::make_identity_input_for_reduction_from_vector_opc(PhaseGVN& gvn, int vopc, BasicType bt) { switch (vopc) { case Op_AndReductionV: switch (bt) { diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 004f133f37cb9..df59ffaa730c4 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -198,10 +198,9 @@ class ReductionNode : public Node { static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt); static int opcode(int opc, BasicType bt); static bool implemented(int opc, uint vlen, BasicType bt); - // Input opc of pre-reduction operation, eg. AddI for AddReductionVI - static Node* make_reduction_input_from_scalar_opc(PhaseGVN& gvn, int opc, BasicType bt); - // Input vopc of vector reduction, eg. AddReductionVI - static Node* make_reduction_input_from_vector_opc(PhaseGVN& gvn, int vopc, BasicType bt); + // Make an identity element (zero for add, one for mul, etc) for opc of scalar/vector reduction. 
+ static Node* make_identity_input_for_reduction_from_scalar_opc(PhaseGVN& gvn, int opc, BasicType bt); + static Node* make_identity_input_for_reduction_from_vector_opc(PhaseGVN& gvn, int vopc, BasicType bt); virtual const Type* bottom_type() const { return _bottom_type; From 3130bf331d3c1e24a28f6677466054b7b1a30fcd Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 24 Apr 2023 16:36:31 +0200 Subject: [PATCH 09/19] Moved move_unordered_reduction_out_of_loop from SuperWord to PhaseIdealLoop --- src/hotspot/share/opto/loopnode.cpp | 10 ++ src/hotspot/share/opto/loopnode.hpp | 3 + src/hotspot/share/opto/loopopts.cpp | 117 +++++++++++++++++++++++ src/hotspot/share/opto/superword.cpp | 133 --------------------------- src/hotspot/share/opto/superword.hpp | 2 - 5 files changed, 130 insertions(+), 135 deletions(-) diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp index e2c009331dfc9..25da52aafacf6 100644 --- a/src/hotspot/share/opto/loopnode.cpp +++ b/src/hotspot/share/opto/loopnode.cpp @@ -4629,6 +4629,16 @@ void PhaseIdealLoop::build_and_optimize() { } } } + + // Move UnorderedReduction out of counted loop. Can be introduced by SuperWord. 
+ if (C->has_loops()) { + for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) { + IdealLoopTree* lpt = iter.current(); + if (lpt->_head->is_CountedLoop()) { + move_unordered_reduction_out_of_loop(lpt); + } + } + } } #ifndef PRODUCT diff --git a/src/hotspot/share/opto/loopnode.hpp b/src/hotspot/share/opto/loopnode.hpp index 37ae6fb9d0554..94f49c9853543 100644 --- a/src/hotspot/share/opto/loopnode.hpp +++ b/src/hotspot/share/opto/loopnode.hpp @@ -1487,6 +1487,9 @@ class PhaseIdealLoop : public PhaseTransform { bool partial_peel( IdealLoopTree *loop, Node_List &old_new ); bool duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old_new); + // Move UnorderedReduction out of loop if possible + void move_unordered_reduction_out_of_loop(IdealLoopTree* loop); + // Create a scheduled list of nodes control dependent on ctrl set. void scheduled_nodelist( IdealLoopTree *loop, VectorSet& ctrl, Node_List &sched ); // Has a use in the vector set diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index d2221a84fa295..896af0e9c3d30 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -41,6 +41,7 @@ #include "opto/rootnode.hpp" #include "opto/subnode.hpp" #include "opto/subtypenode.hpp" +#include "opto/vectornode.hpp" #include "utilities/macros.hpp" //============================================================================= @@ -4126,3 +4127,119 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old return true; } + +// Having a ReductionNode in the loop is expensive. It needs to recursively +// fold together the vector values, for every vectorized loop iteration. If +// we encounter the following pattern, we can move the UnorderedReduction +// outside the loop. 
+// +// CountedLoop init +// | | +// +------+ | +---------------+ +// | | | | +// PhiNode (s) Vector | +// | | | +// UnorderedReduction | +// | | +// +-------------+ +// +// We patch the graph to look like this: +// +// CountedLoop identity_vector +// | | +// +-------+ | +---------------+ +// | | | | +// PhiNode (v) Vector | +// | | | +// init VectorAccumulator | +// | | | | +// UnorderedReduction +-----------+ +// +// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we +// use a vector_accumulator, which does the same reduction, but only element +// wise. This is a single operation, rather than many for the ReductionNode. +// We can then reduce that vector_accumulator after the loop, and also reduce +// the init value into it. +// We can not do this with all reductions. Some reductions do not allow the +// reordering of operations (for example float addition). +void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { + assert(loop->_head->is_CountedLoop(), "sanity"); + + // Find all Phi nodes with UnorderedReduction on backedge. + CountedLoopNode* cl = loop->_head->as_CountedLoop(); + for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) { + Node* phi = cl->fast_out(j); + if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) { + continue; + } + + UnorderedReductionNode* ur = phi->in(2)->as_UnorderedReduction(); + // For ur, we expect: no ctrl, phi as scalar input, + // and a vector input from within the loop. 
+ if (ur->in(0) != nullptr || + ur->in(1) != phi || + !ur->in(2)->is_Vector() || + get_ctrl(ur->in(2)) != cl) { + assert(ur->in(1) == nullptr || !ur->in(1)->is_UnorderedReduction(), + "missed reduction optimization: chain of UnorderedReduction"); + continue; + } + VectorNode* vector = ur->in(2)->as_Vector(); + + // Expect all uses to be outside the loop, except Phi + for (DUIterator_Fast kmax, k = ur->fast_outs(kmax); k < kmax; k++) { + Node* use = ur->fast_out(k); + if (use != phi && ctrl_or_self(use) == cl) { + return; + } + } + + // Determine types + BasicType bt = ur->vect_type()->element_basic_type(); + const Type* bt_t = Type::get_const_basic_type(bt); + + // Create vector of identity elements (zero for add, one for mul, etc) + Node* identity_scalar = ReductionNode::make_identity_input_for_reduction_from_vector_opc(_igvn, ur->Opcode(), bt); + set_ctrl(identity_scalar, C->root()); + VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector->length(), bt_t); + _igvn.register_new_node_with_optimizer(identity_vector); + set_ctrl(identity_vector, C->root()); + const TypeVect* vec_t = identity_vector->vect_type(); + + // In each iteration, do vector accumulation + VectorNode* vector_accumulator = ur->make_normal_vector_op(phi, vector, vec_t); + _igvn.register_new_node_with_optimizer(vector_accumulator); + C->copy_node_notes_to(vector_accumulator, ur); + set_ctrl(vector_accumulator, cl); + + // After the loop, we can reduce the init and vector_accumulator + Node* init = phi->in(1); + _igvn.rehash_node_delayed(ur); + ur->set_req_X(1, init, &_igvn); + ur->set_req_X(2, vector_accumulator, &_igvn); + + // Turn the scalar phi into a vector phi + _igvn.rehash_node_delayed(phi); + phi->set_req_X(1, identity_vector, &_igvn); + phi->set_req_X(2, vector_accumulator, &_igvn); + phi->as_Type()->set_type(vec_t); + _igvn.set_type(phi, vec_t); + assert(phi->unique_out() == vector_accumulator, "accumulator is only use of phi"); + + // Update control to outside 
the loop + Node* new_ctrl = get_late_ctrl(ur, cl); + set_ctrl(ur, new_ctrl); + assert(new_ctrl != nullptr && new_ctrl != cl, "new control of ur must be outside loop"); + +#ifdef ASSERT + if (TraceNewVectors) { + tty->print("new Vector node: "); + identity_vector->dump(); + tty->print("new Vector node: "); + phi->dump(); + tty->print("new Vector node: "); + vector_accumulator->dump(); + } +#endif + } +} diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index 4652d66798bd1..db94ba5a61827 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -2538,8 +2538,6 @@ bool SuperWord::output() { } } - Node_List unordered_reductions; - for (int i = 0; i < _block.length(); i++) { Node* n = _block.at(i); Node_List* p = my_pack(n); @@ -2667,9 +2665,6 @@ bool SuperWord::output() { if (node_isa_reduction) { const Type *arith_type = n->bottom_type(); vn = ReductionNode::make(opc, nullptr, in1, in2, arith_type->basic_type()); - if (vn->is_UnorderedReduction()) { - unordered_reductions.push(vn); - } if (in2->is_Load()) { vlen_in_bytes = in2->as_LoadVector()->memory_size(); } else { @@ -2884,138 +2879,10 @@ bool SuperWord::output() { make_reversable.use_new(); } - for (uint i = 0; i < unordered_reductions.size(); i++) { - UnorderedReductionNode* ur = unordered_reductions.at(i)->as_UnorderedReduction(); - move_unordered_reduction_out_of_loop(ur); - } - NOT_PRODUCT(if(is_trace_loop_reverse()) {tty->print_cr("\n Final loop after SuperWord"); print_loop(true);}) return true; } -// Having a ReductionNode in the loop is expensive. It needs to recursively -// fold together the vector values, for every vectorized loop iteration. If -// we encounter the following pattern, we can move the UnorderedReduction -// outside the loop. 
-// -// CountedLoop init -// | | -// +------+ | +---------------+ -// | | | | -// PhiNode (s) Vector | -// | | | -// UnorderedReduction | -// | | -// +-------------+ -// -// We patch the graph to look like this: -// -// CountedLoop identity_vector -// | | -// +-------+ | +---------------+ -// | | | | -// PhiNode (v) Vector | -// | | | -// init VectorAccumulator | -// | | | | -// UnorderedReduction +-----------+ -// -// We turned the scalar (s) Phi into a vectorized one (v). In the loop, we -// use a vector_accumulator, which does the same reduction, but only element -// wise. This is a single operation, rather than many for the ReductionNode. -// We can then reduce that vector_accumulator after the loop, and also reduce -// the init value into it. -// We can not do this with all reductions. Some reductions do not allow the -// reordering of operations (for example float addition). -void SuperWord::move_unordered_reduction_out_of_loop(UnorderedReductionNode* ur) { - Compile* C = _phase->C; - CountedLoopNode *cl = lpt()->_head->as_CountedLoop(); - Node* ctrl = ur->in(0); - Node* in1 = ur->in(1); - Node* in2 = ur->in(2); - - // Expect no ctrl for ur - if (ctrl != nullptr) { - return; - } - - // Expect data loop over backedge of Phi in cl - if (in1 == nullptr || !in1->is_Phi() || in1->in(2) != ur || in1->outcnt() != 1 || - in1->in(0) != cl) { - return; - } - - // Expect all uses to be outside the loop, except Phi - PhiNode* phi = in1->as_Phi(); - for (DUIterator_Fast kmax, k = ur->fast_outs(kmax); k < kmax; k++) { - Node* use = ur->fast_out(k); - if (use != phi && _phase->ctrl_or_self(use) == cl) { - return; - } - } - - // Expect input vector inside loop - if (!in2->is_Vector() || _phase->get_ctrl(in2) != cl) { - return; - } - VectorNode* vector = in2->as_Vector(); - - // Determine types - BasicType bt = ur->vect_type()->element_basic_type(); - const Type* bt_t = Type::get_const_basic_type(bt); - - // Create vector of identity elements (zero for add, one for mul, etc) 
- Node* identity_scalar = ReductionNode::make_identity_input_for_reduction_from_vector_opc(_igvn, ur->Opcode(), bt); - _phase->set_ctrl(identity_scalar, C->root()); - VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector->length(), bt_t); - _igvn.register_new_node_with_optimizer(identity_vector); - _phase->set_ctrl(identity_vector, C->root()); - const TypeVect* vec_t = identity_vector->vect_type(); - - // Build vector Phi - Node* vector_phi = new PhiNode(cl, vec_t); - _igvn.register_new_node_with_optimizer(vector_phi); - C->copy_node_notes_to(vector_phi, phi); - _phase->set_ctrl(vector_phi, cl); - - // Start loop with identity element - vector_phi->set_req(1, identity_vector); - - // In each iteration, do vector accumulation - VectorNode* vector_accumulator = ur->make_normal_vector_op(vector_phi, vector, vec_t); - _igvn.register_new_node_with_optimizer(vector_accumulator); - C->copy_node_notes_to(vector_accumulator, ur); - _phase->set_ctrl(vector_accumulator, cl); - - // And feed that into the vector Phi for the next iteration - vector_phi->set_req(2, vector_accumulator); - - // After the loop, we can reduce the init and vector_accumulator - Node* init = phi->in(1); - ur->set_req_X(1, init, &_igvn); - ur->set_req_X(2, vector_accumulator, &_igvn); - - // Cut output to old Phi, so that we only have outputs outside the loop - phi->set_req_X(2, C->top(), &_igvn); - - // Update control to outside the loop - Node* new_ctrl = _phase->get_late_ctrl(ur, cl); - _phase->set_ctrl(ur, new_ctrl); - assert(new_ctrl != nullptr && new_ctrl != cl, "new control of ur must be outside loop"); - assert(phi->outcnt() == 0, "scalar phi is unused"); - -#ifdef ASSERT - if (TraceNewVectors) { - tty->print("new Vector node: "); - identity_vector->dump(); - tty->print("new Vector node: "); - vector_phi->dump(); - tty->print("new Vector node: "); - vector_accumulator->dump(); - } -#endif -} - //-------------------------create_post_loop_vmask------------------------- // 
Check the post loop vectorizability and create a vector mask if yes. // Return null to bail out if post loop is not vectorizable. diff --git a/src/hotspot/share/opto/superword.hpp b/src/hotspot/share/opto/superword.hpp index 766674a540b11..6d24e528be5ee 100644 --- a/src/hotspot/share/opto/superword.hpp +++ b/src/hotspot/share/opto/superword.hpp @@ -557,8 +557,6 @@ class SuperWord : public ResourceObj { // Convert packs into vector node operations bool output(); - // Move UnorderedReduction out of loop if possible - void move_unordered_reduction_out_of_loop(UnorderedReductionNode* ur); // Create vector mask for post loop vectorization Node* create_post_loop_vmask(); // Create a vector operand for the nodes in pack p for operand: in(opd_idx) From 02546bd2a50f5fb682112776c59d6ea9d2d5fd4e Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Thu, 27 Apr 2023 10:42:18 +0200 Subject: [PATCH 10/19] generalized the algorithm to handle a chain of UnorderedReductions --- src/hotspot/share/opto/loopnode.cpp | 2 +- src/hotspot/share/opto/loopopts.cpp | 201 ++++++++++++++++++++-------- 2 files changed, 145 insertions(+), 58 deletions(-) diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp index 25da52aafacf6..b0da516f61474 100644 --- a/src/hotspot/share/opto/loopnode.cpp +++ b/src/hotspot/share/opto/loopnode.cpp @@ -4631,7 +4631,7 @@ void PhaseIdealLoop::build_and_optimize() { } // Move UnorderedReduction out of counted loop. Can be introduced by SuperWord. 
- if (C->has_loops()) { + if (C->has_loops() && !C->major_progress()) { for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) { IdealLoopTree* lpt = iter.current(); if (lpt->_head->is_CountedLoop()) { diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index 896af0e9c3d30..38f59946001e8 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4128,20 +4128,26 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old return true; } -// Having a ReductionNode in the loop is expensive. It needs to recursively +// Having ReductionNodes in the loop is expensive. They need to recursively // fold together the vector values, for every vectorized loop iteration. If -// we encounter the following pattern, we can move the UnorderedReduction -// outside the loop. +// we encounter the following pattern, we can vector accumulate the values +// inside the loop, and only have a single UnorderedReduction after the loop. // // CountedLoop init // | | -// +------+ | +---------------+ -// | | | | -// PhiNode (s) Vector | -// | | | -// UnorderedReduction | -// | | -// +-------------+ +// +------+ | +-----------------------+ +// | | | | +// PhiNode (s) | +// | | +// | Vector | +// | | | +// UnorderedReduction (first_ur) | +// | | +// ... Vector | +// | | | +// UnorderedReduction (last_ur) | +// | | +// +---------------------+ // // We patch the graph to look like this: // @@ -4149,96 +4155,177 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old // | | // +-------+ | +---------------+ // | | | | -// PhiNode (v) Vector | +// PhiNode (v) | +// | | +// | Vector | +// | | | +// VectorAccumulator | +// | | +// ... Vector | // | | | // init VectorAccumulator | // | | | | // UnorderedReduction +-----------+ // // We turned the scalar (s) Phi into a vectorized one (v). 
In the loop, we -// use a vector_accumulator, which does the same reduction, but only element -// wise. This is a single operation, rather than many for the ReductionNode. -// We can then reduce that vector_accumulator after the loop, and also reduce -// the init value into it. +// use vector_accumulators, which do the same reductions, but only element +// wise. This is a single operation per vector_accumulator, rather than many +// for a UnorderedReduction. We can then reduce the last vector_accumulator +// after the loop, and also reduce the init value into it. // We can not do this with all reductions. Some reductions do not allow the // reordering of operations (for example float addition). void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { - assert(loop->_head->is_CountedLoop(), "sanity"); + assert(!C->major_progress() && loop->_head->is_CountedLoop(), "sanity"); // Find all Phi nodes with UnorderedReduction on backedge. CountedLoopNode* cl = loop->_head->as_CountedLoop(); for (DUIterator_Fast jmax, j = cl->fast_outs(jmax); j < jmax; j++) { Node* phi = cl->fast_out(j); + // We have a phi with a single use, and a UnorderedReduction on the backedge. if (!phi->is_Phi() || phi->outcnt() != 1 || !phi->in(2)->is_UnorderedReduction()) { continue; } - UnorderedReductionNode* ur = phi->in(2)->as_UnorderedReduction(); - // For ur, we expect: no ctrl, phi as scalar input, - // and a vector input from within the loop. 
- if (ur->in(0) != nullptr || - ur->in(1) != phi || - !ur->in(2)->is_Vector() || - get_ctrl(ur->in(2)) != cl) { - assert(ur->in(1) == nullptr || !ur->in(1)->is_UnorderedReduction(), - "missed reduction optimization: chain of UnorderedReduction"); - continue; - } - VectorNode* vector = ur->in(2)->as_Vector(); + UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction(); - // Expect all uses to be outside the loop, except Phi - for (DUIterator_Fast kmax, k = ur->fast_outs(kmax); k < kmax; k++) { - Node* use = ur->fast_out(k); - if (use != phi && ctrl_or_self(use) == cl) { - return; + // Traverse up the chain of UnorderedReductions, checking that it loops back to + // the phi. Check that all UnorderedReductions only have a single use, except for + // the last (last_ur), which only has phi as a use in the loop, and all other uses + // are outside the loop. + UnorderedReductionNode* current = last_ur; + UnorderedReductionNode* first_ur = nullptr; + while (true) { + assert(current->is_UnorderedReduction(), "sanity"); + + // Expect no ctrl and a vector_input from within the loop. + Node* ctrl = current->in(0); + Node* vector_input = current->in(2); + if (ctrl != nullptr || + !vector_input->is_Vector() || + get_ctrl(vector_input) != cl) { + DEBUG_ONLY( current->dump(-1); ) + assert(false, "reduction has ctrl or bad vector_input"); + break; // Chain traversal fails. + } + + // Expect single use of UnorderedReduction, except for last_ur. + if (current == last_ur) { + // Expect all uses to be outside the loop, except phi. + for (DUIterator_Fast kmax, k = current->fast_outs(kmax); k < kmax; k++) { + Node* use = current->fast_out(k); + if (use != phi && ctrl_or_self(use) == cl) { + DEBUG_ONLY( current->dump(-1); ) + assert(false, "reduction has use inside loop"); + break; // Chain traversal fails. 
+ } + } + } else { + if (current->outcnt() != 1) { + DEBUG_ONLY( current->dump(-1); ) + assert(false, "reduction (not last) has more than one use"); + break; // Chain traversal fails. + } } + + // Expect another UnorderedReduction or phi as the scalar input. + Node* scalar_input = current->in(1); + if (scalar_input->is_UnorderedReduction() && + scalar_input->Opcode() == current->Opcode()) { + // Move up the UnorderedReduction chain. + current = scalar_input->as_UnorderedReduction(); + } else if (scalar_input == phi) { + // Chain terminates at phi. + first_ur = current; + current = nullptr; + break; // Success. + } else { + DEBUG_ONLY( current->dump(-1); ) + assert(false, "scalar_input is neither phi nor a matchin reduction"); + break; // Chain traversal fails. + } + } + if (current != nullptr) { + // Chain traversal was not successful. + continue; } + assert(first_ur != nullptr, "must have successfully terminated chain traversal"); // Determine types - BasicType bt = ur->vect_type()->element_basic_type(); - const Type* bt_t = Type::get_const_basic_type(bt); + const TypeVect* vec_t = last_ur->vect_type(); + uint vector_length = vec_t->length(); + BasicType bt = vec_t->element_basic_type(); + const Type* bt_t = Type::get_const_basic_type(bt); // Create vector of identity elements (zero for add, one for mul, etc) - Node* identity_scalar = ReductionNode::make_identity_input_for_reduction_from_vector_opc(_igvn, ur->Opcode(), bt); + Node* identity_scalar = ReductionNode::make_identity_input_for_reduction_from_vector_opc(_igvn, last_ur->Opcode(), bt); set_ctrl(identity_scalar, C->root()); - VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector->length(), bt_t); + VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t); _igvn.register_new_node_with_optimizer(identity_vector); set_ctrl(identity_vector, C->root()); - const TypeVect* vec_t = identity_vector->vect_type(); + assert(vec_t == 
identity_vector->vect_type(), "matching vector type"); +#ifdef ASSERT + if (TraceNewVectors) { + tty->print("new Vector node: "); + identity_vector->dump(); + } +#endif - // In each iteration, do vector accumulation - VectorNode* vector_accumulator = ur->make_normal_vector_op(phi, vector, vec_t); - _igvn.register_new_node_with_optimizer(vector_accumulator); - C->copy_node_notes_to(vector_accumulator, ur); - set_ctrl(vector_accumulator, cl); + // Traverse down the chain of UnorderedReductions, and create the vector_accumulators. + current = first_ur; + Node* last_vector_accumulator = phi; + while (true) { + // Create vector_accumulator to replace current. + Node* vector_input = current->in(2); + VectorNode* vector_accumulator = current->make_normal_vector_op(last_vector_accumulator, vector_input, vec_t); + _igvn.register_new_node_with_optimizer(vector_accumulator); + C->copy_node_notes_to(vector_accumulator, current); + set_ctrl(vector_accumulator, cl); + last_vector_accumulator = vector_accumulator; + if (current != last_ur) { + // All UnorderedReductions except the last are now useless. + _igvn.rehash_node_delayed(current); + current->set_req_X(1, C->top(), &_igvn); + current->set_req_X(2, C->top(), &_igvn); + } +#ifdef ASSERT + if (TraceNewVectors) { + tty->print("new Vector node: "); + vector_accumulator->dump(); + } +#endif + // Iterate down, until we hit last_ur. + if (current != last_ur) { + current = current->unique_out()->as_UnorderedReduction(); + } else { + current = nullptr; + break; + } + } + assert(current == nullptr, "terminated correctly"); - // After the loop, we can reduce the init and vector_accumulator + // After the loop, we can reduce the init and last_vector_accumulator. 
Node* init = phi->in(1); - _igvn.rehash_node_delayed(ur); - ur->set_req_X(1, init, &_igvn); - ur->set_req_X(2, vector_accumulator, &_igvn); + _igvn.rehash_node_delayed(last_ur); + last_ur->set_req_X(1, init, &_igvn); + last_ur->set_req_X(2, last_vector_accumulator, &_igvn); - // Turn the scalar phi into a vector phi + // Turn the scalar phi into a vector phi. _igvn.rehash_node_delayed(phi); phi->set_req_X(1, identity_vector, &_igvn); - phi->set_req_X(2, vector_accumulator, &_igvn); + phi->set_req_X(2, last_vector_accumulator, &_igvn); phi->as_Type()->set_type(vec_t); _igvn.set_type(phi, vec_t); - assert(phi->unique_out() == vector_accumulator, "accumulator is only use of phi"); + assert(phi->outcnt() == 1, "accumulator is only use of phi"); - // Update control to outside the loop - Node* new_ctrl = get_late_ctrl(ur, cl); - set_ctrl(ur, new_ctrl); + // Update control to outside the loop. + Node* new_ctrl = get_late_ctrl(last_ur, cl); + set_ctrl(last_ur, new_ctrl); assert(new_ctrl != nullptr && new_ctrl != cl, "new control of ur must be outside loop"); #ifdef ASSERT if (TraceNewVectors) { - tty->print("new Vector node: "); - identity_vector->dump(); tty->print("new Vector node: "); phi->dump(); - tty->print("new Vector node: "); - vector_accumulator->dump(); } #endif } From a9dbffe667bc04779b11afbfb1c54281c0d8f869 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Tue, 2 May 2023 08:51:24 +0200 Subject: [PATCH 11/19] small bug fix --- src/hotspot/share/opto/loopopts.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index 38f59946001e8..21a06b9a2c2a6 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4200,10 +4200,8 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { // Expect no ctrl and a vector_input from within the loop. 
Node* ctrl = current->in(0); Node* vector_input = current->in(2); - if (ctrl != nullptr || - !vector_input->is_Vector() || - get_ctrl(vector_input) != cl) { - DEBUG_ONLY( current->dump(-1); ) + if (ctrl != nullptr || get_ctrl(vector_input) != cl) { + DEBUG_ONLY( current->dump(1); ) assert(false, "reduction has ctrl or bad vector_input"); break; // Chain traversal fails. } @@ -4239,7 +4237,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { current = nullptr; break; // Success. } else { - DEBUG_ONLY( current->dump(-1); ) + DEBUG_ONLY( current->dump(1); ) assert(false, "scalar_input is neither phi nor a matchin reduction"); break; // Chain traversal fails. } From 5a51ac37f001a248fe9a958b9fc0af7cc18a4d21 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 8 May 2023 08:48:38 +0200 Subject: [PATCH 12/19] copy node notes with igvn registering --- src/hotspot/share/opto/loopopts.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index c0e2a659bdf09..77c6517ea13e1 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4269,8 +4269,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { // Create vector_accumulator to replace current. 
Node* vector_input = current->in(2); VectorNode* vector_accumulator = current->make_normal_vector_op(last_vector_accumulator, vector_input, vec_t); - _igvn.register_new_node_with_optimizer(vector_accumulator); - C->copy_node_notes_to(vector_accumulator, current); + _igvn.register_new_node_with_optimizer(vector_accumulator, current); set_ctrl(vector_accumulator, cl); last_vector_accumulator = vector_accumulator; if (current != last_ur) { From 56990bdeae564f75cd2221b77fcc581f8cba7628 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 10 May 2023 06:52:27 +0200 Subject: [PATCH 13/19] Address review suggestion by @pfustc --- src/hotspot/share/opto/loopopts.cpp | 73 +++++++++++---------------- src/hotspot/share/opto/vectornode.cpp | 6 ++- src/hotspot/share/opto/vectornode.hpp | 3 +- 3 files changed, 37 insertions(+), 45 deletions(-) diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index 77c6517ea13e1..60947b012b9ec 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4252,8 +4252,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { Node* identity_scalar = ReductionNode::make_identity_input_for_reduction_from_vector_opc(_igvn, last_ur->Opcode(), bt); set_ctrl(identity_scalar, C->root()); VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t); - _igvn.register_new_node_with_optimizer(identity_vector); - set_ctrl(identity_vector, C->root()); + register_new_node(identity_vector, C->root()); assert(vec_t == identity_vector->vect_type(), "matching vector type"); #ifdef ASSERT if (TraceNewVectors) { @@ -4262,62 +4261,50 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { } #endif - // Traverse down the chain of UnorderedReductions, and create the vector_accumulators. + // Turn the scalar phi into a vector phi. 
+ _igvn.rehash_node_delayed(phi); + Node* init = phi->in(1); // Remember init before replacing it. + phi->set_req_X(1, identity_vector, &_igvn); + phi->as_Type()->set_type(vec_t); + _igvn.set_type(phi, vec_t); + + // Traverse down the chain of UnorderedReductions, and replace them with vector_accumulators. current = first_ur; - Node* last_vector_accumulator = phi; while (true) { // Create vector_accumulator to replace current. - Node* vector_input = current->in(2); + Node* last_vector_accumulator = current->in(1); + Node* vector_input = current->in(2); VectorNode* vector_accumulator = current->make_normal_vector_op(last_vector_accumulator, vector_input, vec_t); - _igvn.register_new_node_with_optimizer(vector_accumulator, current); - set_ctrl(vector_accumulator, cl); - last_vector_accumulator = vector_accumulator; - if (current != last_ur) { - // All UnorderedReductions except the last are now useless. - _igvn.rehash_node_delayed(current); - current->set_req_X(1, C->top(), &_igvn); - current->set_req_X(2, C->top(), &_igvn); - } + register_new_node(vector_accumulator, cl); + _igvn.replace_node(current, vector_accumulator); #ifdef ASSERT if (TraceNewVectors) { tty->print("new Vector node: "); vector_accumulator->dump(); } #endif - // Iterate down, until we hit last_ur. - if (current != last_ur) { - current = current->unique_out()->as_UnorderedReduction(); - } else { - current = nullptr; + if (current == last_ur) { break; } + current = vector_accumulator->unique_out()->as_UnorderedReduction(); } - assert(current == nullptr, "terminated correctly"); - - // After the loop, we can reduce the init and last_vector_accumulator. - Node* init = phi->in(1); - _igvn.rehash_node_delayed(last_ur); - last_ur->set_req_X(1, init, &_igvn); - last_ur->set_req_X(2, last_vector_accumulator, &_igvn); - // Turn the scalar phi into a vector phi. 
- _igvn.rehash_node_delayed(phi); - phi->set_req_X(1, identity_vector, &_igvn); - phi->set_req_X(2, last_vector_accumulator, &_igvn); - phi->as_Type()->set_type(vec_t); - _igvn.set_type(phi, vec_t); - assert(phi->outcnt() == 1, "accumulator is only use of phi"); - - // Update control to outside the loop. - Node* new_ctrl = get_late_ctrl(last_ur, cl); - set_ctrl(last_ur, new_ctrl); - assert(new_ctrl != nullptr && new_ctrl != cl, "new control of ur must be outside loop"); + // Create post-loop reduction. + Node* last_accumulator = phi->in(2); + Node* post_loop_reduction = ReductionNode::make_from_vopc(first_ur->Opcode(), nullptr, init, last_accumulator, bt); -#ifdef ASSERT - if (TraceNewVectors) { - tty->print("new Vector node: "); - phi->dump(); + // Take over uses of last_accumulator that are not in the loop. + for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) { + Node* use = last_accumulator->out(i); + if (ctrl_or_self(use) != cl) { + use->replace_edge(last_accumulator, post_loop_reduction, &_igvn); + --i; + } } -#endif + register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl)); + + assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction"); + assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator"); + assert(phi->outcnt() == 1, "accumulator is the only use of phi"); } } diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index ba42432eb0733..1784ef2a9e504 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -1265,13 +1265,17 @@ int ReductionNode::opcode(int opc, BasicType bt) { } // Return the appropriate reduction node. 
-ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, BasicType bt) { +ReductionNode* ReductionNode::make(int opc, Node* ctrl, Node* n1, Node* n2, BasicType bt) { int vopc = opcode(opc, bt); // This method should not be called for unimplemented vectors. guarantee(vopc != opc, "Vector for '%s' is not implemented", NodeClassNames[opc]); + return ReductionNode::make_from_vopc(vopc, ctrl, n1, n2, bt); +} + +ReductionNode* ReductionNode::make_from_vopc(int vopc, Node* ctrl, Node* n1, Node* n2, BasicType bt) { switch (vopc) { case Op_AddReductionVI: return new AddReductionVINode(ctrl, n1, n2); case Op_AddReductionVL: return new AddReductionVLNode(ctrl, n1, n2); diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 481712d680dec..efbf21ee4c9e6 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -197,7 +197,8 @@ class ReductionNode : public Node { init_class_id(Class_Reduction); } - static ReductionNode* make(int opc, Node *ctrl, Node* in1, Node* in2, BasicType bt); + static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt); + static ReductionNode* make_from_vopc(int vopc, Node* ctrl, Node* in1, Node* in2, BasicType bt); static int opcode(int opc, BasicType bt); static bool implemented(int opc, uint vlen, BasicType bt); // Make an identity element (zero for add, one for mul, etc) for opc of scalar/vector reduction. From 72fa58e021b130a262337d40ba17215bbbb6bbaf Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 10 May 2023 08:25:47 +0200 Subject: [PATCH 14/19] small bugfix. 
And put TraceNewVector in VectorNode::trace_new_vector --- src/hotspot/share/opto/compile.cpp | 7 +------ src/hotspot/share/opto/loopopts.cpp | 18 +++++------------ src/hotspot/share/opto/superword.cpp | 28 ++++----------------------- src/hotspot/share/opto/vectornode.hpp | 9 +++++++++ 4 files changed, 19 insertions(+), 43 deletions(-) diff --git a/src/hotspot/share/opto/compile.cpp b/src/hotspot/share/opto/compile.cpp index c49c6eccc467e..cac9053e3a01d 100644 --- a/src/hotspot/share/opto/compile.cpp +++ b/src/hotspot/share/opto/compile.cpp @@ -2844,12 +2844,7 @@ void Compile::process_logic_cone_root(PhaseIterGVN &igvn, Node *n, VectorSet &vi if (mask == nullptr || Matcher::match_rule_supported_vector_masked(Op_MacroLogicV, vt->length(), vt->element_basic_type())) { Node* macro_logic = xform_to_MacroLogicV(igvn, vt, partition, inputs); -#ifdef ASSERT - if (TraceNewVectors) { - tty->print("new Vector node: "); - macro_logic->dump(); - } -#endif + VectorNode::trace_new_vector(macro_logic, "MacroLogic"); igvn.replace_node(n, macro_logic); } } diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index 60947b012b9ec..cb6c5202aed0e 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4254,12 +4254,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t); register_new_node(identity_vector, C->root()); assert(vec_t == identity_vector->vect_type(), "matching vector type"); -#ifdef ASSERT - if (TraceNewVectors) { - tty->print("new Vector node: "); - identity_vector->dump(); - } -#endif + VectorNode::trace_new_vector(identity_vector, "UnorderedReduction"); // Turn the scalar phi into a vector phi. 
_igvn.rehash_node_delayed(phi); @@ -4277,12 +4272,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { VectorNode* vector_accumulator = current->make_normal_vector_op(last_vector_accumulator, vector_input, vec_t); register_new_node(vector_accumulator, cl); _igvn.replace_node(current, vector_accumulator); -#ifdef ASSERT - if (TraceNewVectors) { - tty->print("new Vector node: "); - vector_accumulator->dump(); - } -#endif + VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction"); if (current == last_ur) { break; } @@ -4296,12 +4286,14 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { // Take over uses of last_accumulator that are not in the loop. for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) { Node* use = last_accumulator->out(i); - if (ctrl_or_self(use) != cl) { + if (use != phi && use != post_loop_reduction) { + assert(ctrl_or_self(use) != cl, "use must be outside loop"); use->replace_edge(last_accumulator, post_loop_reduction, &_igvn); --i; } } register_new_node(post_loop_reduction, get_late_ctrl(post_loop_reduction, cl)); + VectorNode::trace_new_vector(post_loop_reduction, "UnorderedReduction"); assert(last_accumulator->outcnt() == 2, "last_accumulator has 2 uses: phi and post_loop_reduction"); assert(post_loop_reduction->outcnt() > 0, "should have taken over all non loop uses of last_accumulator"); diff --git a/src/hotspot/share/opto/superword.cpp b/src/hotspot/share/opto/superword.cpp index ba58270bf27f0..abbb1625a7733 100644 --- a/src/hotspot/share/opto/superword.cpp +++ b/src/hotspot/share/opto/superword.cpp @@ -3329,12 +3329,7 @@ bool SuperWord::output() { if (vlen_in_bytes > max_vlen_in_bytes) { max_vlen_in_bytes = vlen_in_bytes; } -#ifdef ASSERT - if (TraceNewVectors) { - tty->print("new Vector node: "); - vn->dump(); - } -#endif + VectorNode::trace_new_vector(vn, "SuperWord"); } }//for (int i = 0; i < _block.length(); i++) @@ -3507,12 
+3502,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) { assert(VectorNode::is_populate_index_supported(iv_bt), "Should support"); const TypeVect* vt = TypeVect::make(iv_bt, vlen); Node* vn = new PopulateIndexNode(iv(), _igvn.intcon(1), vt); -#ifdef ASSERT - if (TraceNewVectors) { - tty->print("new Vector node: "); - vn->dump(); - } -#endif + VectorNode::trace_new_vector(vn, "SuperWord"); _igvn.register_new_node_with_optimizer(vn); _phase->set_ctrl(vn, _phase->get_ctrl(opd)); return vn; @@ -3585,12 +3575,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) { _igvn.register_new_node_with_optimizer(vn); _phase->set_ctrl(vn, _phase->get_ctrl(opd)); -#ifdef ASSERT - if (TraceNewVectors) { - tty->print("new Vector node: "); - vn->dump(); - } -#endif + VectorNode::trace_new_vector(vn, "SuperWord"); return vn; } @@ -3622,12 +3607,7 @@ Node* SuperWord::vector_opd(Node_List* p, int opd_idx) { } _igvn.register_new_node_with_optimizer(pk); _phase->set_ctrl(pk, _phase->get_ctrl(opd)); -#ifdef ASSERT - if (TraceNewVectors) { - tty->print("new Vector node: "); - pk->dump(); - } -#endif + VectorNode::trace_new_vector(pk, "SuperWord"); return pk; } diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index efbf21ee4c9e6..c6f023d30a101 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -132,6 +132,15 @@ class VectorNode : public TypeNode { static bool is_vector_shift_count(Node* n) { return is_vector_shift_count(n->Opcode()); } + + static void trace_new_vector(Node* n, const char* context) { +#ifdef ASSERT + if (TraceNewVectors) { + tty->print("TraceNewVectors [%s]: ", context); + n->dump(); + } +#endif + } }; //===========================Vector=ALU=Operations============================= From 31d977c21f7a2b62fb8123bc7967731aa961e373 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 10 May 2023 13:40:42 +0200 Subject: [PATCH 15/19] use is_counted and is_innermost --- 
src/hotspot/share/opto/loopnode.cpp | 2 +- src/hotspot/share/opto/loopopts.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hotspot/share/opto/loopnode.cpp b/src/hotspot/share/opto/loopnode.cpp index 1c4278124536b..7885f5ea1527c 100644 --- a/src/hotspot/share/opto/loopnode.cpp +++ b/src/hotspot/share/opto/loopnode.cpp @@ -4633,7 +4633,7 @@ void PhaseIdealLoop::build_and_optimize() { if (C->has_loops() && !C->major_progress()) { for (LoopTreeIterator iter(_ltree_root); !iter.done(); iter.next()) { IdealLoopTree* lpt = iter.current(); - if (lpt->_head->is_CountedLoop()) { + if (lpt->is_counted() && lpt->is_innermost()) { move_unordered_reduction_out_of_loop(lpt); } } diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index cb6c5202aed0e..5bcf46e8eaec0 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4169,7 +4169,7 @@ bool PhaseIdealLoop::duplicate_loop_backedge(IdealLoopTree *loop, Node_List &old // We can not do this with all reductions. Some reductions do not allow the // reordering of operations (for example float addition). void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { - assert(!C->major_progress() && loop->_head->is_CountedLoop(), "sanity"); + assert(!C->major_progress() && loop->is_counted() && loop->is_innermost(), "sanity"); // Find all Phi nodes with UnorderedReduction on backedge. 
CountedLoopNode* cl = loop->_head->as_CountedLoop(); From 0a72f4c405c83ebb5f40acb5abb3cbadf3df31cc Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 15 May 2023 09:39:08 +0200 Subject: [PATCH 16/19] Added Matcher::match_rule_supported_vector check, removed bad assert, added test for it --- src/hotspot/share/opto/loopopts.cpp | 20 ++- src/hotspot/share/opto/vectornode.hpp | 28 ++++ .../superword/TestUnorderedReduction.java | 149 ++++++++++++++++++ 3 files changed, 189 insertions(+), 8 deletions(-) create mode 100644 test/hotspot/jtreg/compiler/loopopts/superword/TestUnorderedReduction.java diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index 5bcf46e8eaec0..2850fd8b879dd 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4182,6 +4182,18 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { UnorderedReductionNode* last_ur = phi->in(2)->as_UnorderedReduction(); + // Determine types + const TypeVect* vec_t = last_ur->vect_type(); + uint vector_length = vec_t->length(); + BasicType bt = vec_t->element_basic_type(); + const Type* bt_t = Type::get_const_basic_type(bt); + + if (!last_ur->make_normal_vector_op_implemented(vec_t)) { + DEBUG_ONLY( last_ur->dump(); ) + assert(false, "do not have normal vector op for this reduction"); + continue; // not implemented -> fails + } + // Traverse up the chain of UnorderedReductions, checking that it loops back to // the phi. Check that all UnorderedReductions only have a single use, except for // the last (last_ur), which only has phi as a use in the loop, and all other uses @@ -4213,8 +4225,6 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { } } else { if (current->outcnt() != 1) { - DEBUG_ONLY( current->dump(-1); ) - assert(false, "reduction (not last) has more than one use"); break; // Chain traversal fails. 
} } @@ -4242,12 +4252,6 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { } assert(first_ur != nullptr, "must have successfully terminated chain traversal"); - // Determine types - const TypeVect* vec_t = last_ur->vect_type(); - uint vector_length = vec_t->length(); - BasicType bt = vec_t->element_basic_type(); - const Type* bt_t = Type::get_const_basic_type(bt); - // Create vector of identity elements (zero for add, one for mul, etc) Node* identity_scalar = ReductionNode::make_identity_input_for_reduction_from_vector_opc(_igvn, last_ur->Opcode(), bt); set_ctrl(identity_scalar, C->root()); diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index c6f023d30a101..70d9af793fffb 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -241,6 +241,7 @@ class UnorderedReductionNode : public ReductionNode { } virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) = 0; + virtual bool make_normal_vector_op_implemented(const TypeVect* vt) = 0; }; //------------------------------AddReductionVINode-------------------------------------- @@ -252,6 +253,9 @@ class AddReductionVINode : public UnorderedReductionNode { virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { return new AddVINode(in1, in2, vt); } + virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { + return Matcher::match_rule_supported_vector(Op_AddVI, vt->length(), vt->element_basic_type()); + } }; //------------------------------AddReductionVLNode-------------------------------------- @@ -263,6 +267,9 @@ class AddReductionVLNode : public UnorderedReductionNode { virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { return new AddVLNode(in1, in2, vt); } + virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { + return Matcher::match_rule_supported_vector(Op_AddVL, vt->length(), 
vt->element_basic_type()); + } }; //------------------------------AddReductionVFNode-------------------------------------- @@ -426,6 +433,9 @@ class MulReductionVINode : public UnorderedReductionNode { virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { return new MulVINode(in1, in2, vt); } + virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { + return Matcher::match_rule_supported_vector(Op_MulVI, vt->length(), vt->element_basic_type()); + } }; //------------------------------MulReductionVLNode-------------------------------------- @@ -437,6 +447,9 @@ class MulReductionVLNode : public UnorderedReductionNode { virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { return new MulVLNode(in1, in2, vt); } + virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { + return Matcher::match_rule_supported_vector(Op_MulVL, vt->length(), vt->element_basic_type()); + } }; //------------------------------MulReductionVFNode-------------------------------------- @@ -783,6 +796,9 @@ class AndReductionVNode : public UnorderedReductionNode { virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { return new AndVNode(in1, in2, vt); } + virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { + return Matcher::match_rule_supported_vector(Op_AndV, vt->length(), vt->element_basic_type()); + } }; //------------------------------OrVNode--------------------------------------- @@ -803,6 +819,9 @@ class OrReductionVNode : public UnorderedReductionNode { virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { return new OrVNode(in1, in2, vt); } + virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { + return Matcher::match_rule_supported_vector(Op_OrV, vt->length(), vt->element_basic_type()); + } }; //------------------------------XorVNode--------------------------------------- @@ -823,6 +842,9 @@ class 
XorReductionVNode : public UnorderedReductionNode { virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { return new XorVNode(in1, in2, vt); } + virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { + return Matcher::match_rule_supported_vector(Op_XorV, vt->length(), vt->element_basic_type()); + } }; //------------------------------MinReductionVNode-------------------------------------- @@ -834,6 +856,9 @@ class MinReductionVNode : public UnorderedReductionNode { virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { return new MinVNode(in1, in2, vt); } + virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { + return Matcher::match_rule_supported_vector(Op_MinV, vt->length(), vt->element_basic_type()); + } }; //------------------------------MaxReductionVNode-------------------------------------- @@ -845,6 +870,9 @@ class MaxReductionVNode : public UnorderedReductionNode { virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { return new MaxVNode(in1, in2, vt); } + virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { + return Matcher::match_rule_supported_vector(Op_MaxV, vt->length(), vt->element_basic_type()); + } }; //------------------------------CompressVNode-------------------------------------- diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestUnorderedReduction.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestUnorderedReduction.java new file mode 100644 index 0000000000000..1c84750652b1e --- /dev/null +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestUnorderedReduction.java @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2023, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + */ + +/** + * @test + * @bug 8302652 + * @summary Special test cases for PhaseIdealLoop::move_unordered_reduction_out_of_loop + * @library /test/lib / + * @run driver compiler.loopopts.superword.TestUnorderedReduction + */ + +package compiler.loopopts.superword; + +import compiler.lib.ir_framework.*; + +public class TestUnorderedReduction { + static final int RANGE = 1024; + static final int ITER = 10; + + public static void main(String[] args) { + TestFramework.runWithFlags("-Xbatch", + "-XX:CompileCommand=compileonly,compiler.loopopts.superword.TestUnorderedReduction::test*", + "-XX:MaxVectorSize=16"); + } + + @Run(test = {"test1", "test2"}) + @Warmup(0) + public void runTests() throws Exception { + int[] data = new int[RANGE]; + + init(data); + for (int i = 0; i < ITER; i++) { + int r1 = test1(data, i); + int r2 = ref1(data, i); + if (r1 != r2) { + throw new RuntimeException("Wrong result test1: " + r1 + " != " + r2); + } + } + + for (int i = 0; i < ITER; i++) { + int r1 = test2(data, i); + int r2 = ref2(data, i); + if (r1 != r2) { + throw new 
RuntimeException("Wrong result test2: " + r1 + " != " + r2); + } + } + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR, "> 0", + IRNode.ADD_VI, "= 0", + IRNode.ADD_REDUCTION_VI, "> 0"}, // count can be high + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static int test1(int[] data, int sum) { + // Vectorizes, but the UnorderedReduction cannot be moved out of the loop, + // because we have a use inside the loop. + int x = 0; + for (int i = 0; i < RANGE; i+=8) { + sum += 11 * data[i+0]; // vec 1 (16 bytes) + sum += 11 * data[i+1]; + sum += 11 * data[i+2]; + sum += 11 * data[i+3]; + x = sum + i; // vec 1 reduction has more than 1 use + sum += 11 * data[i+4]; // vec 2 (next 16 bytes) + sum += 11 * data[i+5]; + sum += 11 * data[i+6]; + sum += 11 * data[i+7]; + } + return sum + x; + } + + static int ref1(int[] data, int sum) { + int x = 0; + for (int i = 0; i < RANGE; i+=8) { + sum += 11 * data[i+0]; + sum += 11 * data[i+1]; + sum += 11 * data[i+2]; + sum += 11 * data[i+3]; + x = sum + i; + sum += 11 * data[i+4]; + sum += 11 * data[i+5]; + sum += 11 * data[i+6]; + sum += 11 * data[i+7]; + } + return sum + x; + } + + @Test + @IR(counts = {IRNode.LOAD_VECTOR, "> 0", + IRNode.ADD_VI, "> 0", + IRNode.ADD_REDUCTION_VI, "> 0", + IRNode.ADD_REDUCTION_VI, "<= 2"}, // count must be low + applyIfCPUFeatureOr = {"sse4.1", "true", "asimd", "true"}) + static int test2(int[] data, int sum) { + for (int i = 0; i < RANGE; i+=8) { + // Vectorized, and UnorderedReduction moved outside loop. 
+ sum += 11 * data[i+0]; // vec 1 + sum += 11 * data[i+1]; + sum += 11 * data[i+2]; + sum += 11 * data[i+3]; + sum += 11 * data[i+4]; // vec 2 + sum += 11 * data[i+5]; + sum += 11 * data[i+6]; + sum += 11 * data[i+7]; + } + return sum; + } + + static int ref2(int[] data, int sum) { + for (int i = 0; i < RANGE; i+=8) { + sum += 11 * data[i+0]; + sum += 11 * data[i+1]; + sum += 11 * data[i+2]; + sum += 11 * data[i+3]; + sum += 11 * data[i+4]; + sum += 11 * data[i+5]; + sum += 11 * data[i+6]; + sum += 11 * data[i+7]; + } + return sum; + } + + + static void init(int[] data) { + for (int i = 0; i < RANGE; i++) { + data[i] = i + 1; + } + } +} From 9291fb31ac6916dd85af860b035df9dce8fa2c55 Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Mon, 15 May 2023 12:57:37 +0200 Subject: [PATCH 17/19] whitespace fix --- .../compiler/loopopts/superword/TestUnorderedReduction.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test/hotspot/jtreg/compiler/loopopts/superword/TestUnorderedReduction.java b/test/hotspot/jtreg/compiler/loopopts/superword/TestUnorderedReduction.java index 1c84750652b1e..d50dc96ffb301 100644 --- a/test/hotspot/jtreg/compiler/loopopts/superword/TestUnorderedReduction.java +++ b/test/hotspot/jtreg/compiler/loopopts/superword/TestUnorderedReduction.java @@ -48,7 +48,7 @@ public static void main(String[] args) { public void runTests() throws Exception { int[] data = new int[RANGE]; - init(data); + init(data); for (int i = 0; i < ITER; i++) { int r1 = test1(data, i); int r2 = ref1(data, i); @@ -65,7 +65,7 @@ public void runTests() throws Exception { } } } - + @Test @IR(counts = {IRNode.LOAD_VECTOR, "> 0", IRNode.ADD_VI, "= 0", @@ -104,7 +104,7 @@ static int ref1(int[] data, int sum) { } return sum + x; } - + @Test @IR(counts = {IRNode.LOAD_VECTOR, "> 0", IRNode.ADD_VI, "> 0", From e1af0966e3f5cd6fcee4a42a476b6c9cdf312c6c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 17 May 2023 12:35:00 +0200 Subject: [PATCH 18/19] Address 
review suggestion from @vnkozlov and @jatin-bhateja --- src/hotspot/share/opto/loopopts.cpp | 12 +- src/hotspot/share/opto/vectorIntrinsics.cpp | 2 +- src/hotspot/share/opto/vectornode.cpp | 123 +++++++++++++++++--- src/hotspot/share/opto/vectornode.hpp | 66 +---------- 4 files changed, 120 insertions(+), 83 deletions(-) diff --git a/src/hotspot/share/opto/loopopts.cpp b/src/hotspot/share/opto/loopopts.cpp index 2850fd8b879dd..34dc31cb3c64b 100644 --- a/src/hotspot/share/opto/loopopts.cpp +++ b/src/hotspot/share/opto/loopopts.cpp @@ -4188,7 +4188,10 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { BasicType bt = vec_t->element_basic_type(); const Type* bt_t = Type::get_const_basic_type(bt); - if (!last_ur->make_normal_vector_op_implemented(vec_t)) { + // Convert opcode from vector-reduction -> scalar -> normal-vector-op + const int sopc = VectorNode::scalar_opcode(last_ur->Opcode(), bt); + const int vopc = VectorNode::opcode(sopc, bt); + if (!Matcher::match_rule_supported_vector(vopc, vector_length, bt)) { DEBUG_ONLY( last_ur->dump(); ) assert(false, "do not have normal vector op for this reduction"); continue; // not implemented -> fails @@ -4252,8 +4255,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { } assert(first_ur != nullptr, "must have successfully terminated chain traversal"); - // Create vector of identity elements (zero for add, one for mul, etc) - Node* identity_scalar = ReductionNode::make_identity_input_for_reduction_from_vector_opc(_igvn, last_ur->Opcode(), bt); + Node* identity_scalar = ReductionNode::make_identity_con_scalar(_igvn, sopc, bt); set_ctrl(identity_scalar, C->root()); VectorNode* identity_vector = VectorNode::scalar2vector(identity_scalar, vector_length, bt_t); register_new_node(identity_vector, C->root()); @@ -4273,7 +4275,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { // Create vector_accumulator to replace current. 
Node* last_vector_accumulator = current->in(1); Node* vector_input = current->in(2); - VectorNode* vector_accumulator = current->make_normal_vector_op(last_vector_accumulator, vector_input, vec_t); + VectorNode* vector_accumulator = VectorNode::make(vopc, last_vector_accumulator, vector_input, vec_t); register_new_node(vector_accumulator, cl); _igvn.replace_node(current, vector_accumulator); VectorNode::trace_new_vector(vector_accumulator, "UnorderedReduction"); @@ -4285,7 +4287,7 @@ void PhaseIdealLoop::move_unordered_reduction_out_of_loop(IdealLoopTree* loop) { // Create post-loop reduction. Node* last_accumulator = phi->in(2); - Node* post_loop_reduction = ReductionNode::make_from_vopc(first_ur->Opcode(), nullptr, init, last_accumulator, bt); + Node* post_loop_reduction = ReductionNode::make(sopc, nullptr, init, last_accumulator, bt); // Take over uses of last_accumulator that are not in the loop. for (DUIterator i = last_accumulator->outs(); last_accumulator->has_out(i); i++) { diff --git a/src/hotspot/share/opto/vectorIntrinsics.cpp b/src/hotspot/share/opto/vectorIntrinsics.cpp index dd2e6807ba769..51b531e42c006 100644 --- a/src/hotspot/share/opto/vectorIntrinsics.cpp +++ b/src/hotspot/share/opto/vectorIntrinsics.cpp @@ -1536,7 +1536,7 @@ bool LibraryCallKit::inline_vector_reduction() { } } - Node* init = ReductionNode::make_identity_input_for_reduction_from_scalar_opc(gvn(), opc, elem_bt); + Node* init = ReductionNode::make_identity_con_scalar(gvn(), opc, elem_bt); Node* value = nullptr; if (mask == nullptr) { assert(!is_masked_op, "Masked op needs the mask value never null"); diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index 1784ef2a9e504..ed85079466e7f 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -34,7 +34,7 @@ //------------------------------VectorNode-------------------------------------- // Return the vector operator for the specified scalar operation -// 
and vector length. +// and basic type. int VectorNode::opcode(int sopc, BasicType bt) { switch (sopc) { case Op_AddI: @@ -274,6 +274,109 @@ int VectorNode::opcode(int sopc, BasicType bt) { } }
+// Return the scalar opcode for the specified vector or reduction opcode
+// and basic type. Asserts and returns 0 if the combination is not handled.
+int VectorNode::scalar_opcode(int vopc, BasicType bt) {
+  switch (vopc) {
+    case Op_AddReductionVI:
+    case Op_AddVI:
+      return Op_AddI;
+    case Op_AddReductionVL:
+    case Op_AddVL:
+      return Op_AddL;
+    case Op_MulReductionVI:
+    case Op_MulVI:
+      return Op_MulI;
+    case Op_MulReductionVL:
+    case Op_MulVL:
+      return Op_MulL;
+    case Op_AndReductionV:
+    case Op_AndV:
+      switch (bt) {
+        case T_BOOLEAN:
+        case T_CHAR:
+        case T_BYTE:
+        case T_SHORT:
+        case T_INT:
+          return Op_AndI;
+        case T_LONG:
+          return Op_AndL;
+        default:
+          assert(false, "basic type not handled");
+          return 0;
+      }
+    case Op_OrReductionV:
+    case Op_OrV:
+      switch (bt) {
+        case T_BOOLEAN:
+        case T_CHAR:
+        case T_BYTE:
+        case T_SHORT:
+        case T_INT:
+          return Op_OrI;
+        case T_LONG:
+          return Op_OrL;
+        default:
+          assert(false, "basic type not handled");
+          return 0;
+      }
+    case Op_XorReductionV:
+    case Op_XorV:
+      switch (bt) {
+        case T_BOOLEAN:
+        case T_CHAR:
+        case T_BYTE:
+        case T_SHORT:
+        case T_INT:
+          return Op_XorI;
+        case T_LONG:
+          return Op_XorL;
+        default:
+          assert(false, "basic type not handled");
+          return 0;
+      }
+    case Op_MinReductionV:
+    case Op_MinV:
+      switch (bt) {
+        case T_BOOLEAN:
+        case T_CHAR:
+          assert(false, "boolean and char are unsigned, not implemented for Min");
+          return 0;
+        case T_BYTE:
+        case T_SHORT:
+        case T_INT:
+          return Op_MinI;
+        case T_LONG:
+          return Op_MinL;
+        default:
+          assert(false, "basic type not handled");
+          return 0;
+      }
+    case Op_MaxReductionV:
+    case Op_MaxV:
+      switch (bt) {
+        case T_BOOLEAN:
+        case T_CHAR:
+          assert(false, "boolean and char are unsigned, not implemented for Max");
+          return 0;
+        case T_BYTE:
+        case T_SHORT:
+        case T_INT:
+          return Op_MaxI;
+        case T_LONG:
+          return Op_MaxL;
+        default:
+          assert(false, "basic type not handled");
+          return 0;
+      }
+    default:
+      assert(false,
+             "Vector node %s is not handled in VectorNode::scalar_opcode",
+             NodeClassNames[vopc]);
+      return 0; // Unimplemented
+  }
+}
+
 int VectorNode::replicate_opcode(BasicType bt) { switch(bt) { case T_BOOLEAN: @@ -1265,17 +1368,13 @@ int ReductionNode::opcode(int opc, BasicType bt) { } // Return the appropriate reduction node. -ReductionNode* ReductionNode::make(int opc, Node* ctrl, Node* n1, Node* n2, BasicType bt) { +ReductionNode* ReductionNode::make(int opc, Node *ctrl, Node* n1, Node* n2, BasicType bt) { int vopc = opcode(opc, bt); // This method should not be called for unimplemented vectors. guarantee(vopc != opc, "Vector for '%s' is not implemented", NodeClassNames[opc]); - return ReductionNode::make_from_vopc(vopc, ctrl, n1, n2, bt); -} - -ReductionNode* ReductionNode::make_from_vopc(int vopc, Node* ctrl, Node* n1, Node* n2, BasicType bt) { switch (vopc) { case Op_AddReductionVI: return new AddReductionVINode(ctrl, n1, n2); case Op_AddReductionVL: return new AddReductionVLNode(ctrl, n1, n2); @@ -1402,16 +1501,10 @@ Node* VectorCastNode::Identity(PhaseGVN* phase) { return this; } -// Input opc of pre-reduction operation, eg AddI for AddReductionVI -Node* ReductionNode::make_identity_input_for_reduction_from_scalar_opc(PhaseGVN& gvn, int opc, BasicType bt) { - int vopc = opcode(opc, bt); - guarantee(vopc != opc, "Vector reduction for '%s' is not implemented", NodeClassNames[opc]); - - return make_identity_input_for_reduction_from_vector_opc(gvn, vopc, bt); -} +Node* ReductionNode::make_identity_con_scalar(PhaseGVN& gvn, int sopc, BasicType bt) { + int vopc = opcode(sopc, bt); + guarantee(vopc != sopc, "Vector reduction for '%s' is not implemented", NodeClassNames[sopc]); -// Input opc of vector reduction, eg. 
AddReductionVI -Node* ReductionNode::make_identity_input_for_reduction_from_vector_opc(PhaseGVN& gvn, int vopc, BasicType bt) { switch (vopc) { case Op_AndReductionV: switch (bt) { diff --git a/src/hotspot/share/opto/vectornode.hpp b/src/hotspot/share/opto/vectornode.hpp index 70d9af793fffb..8efa5b40dde81 100644 --- a/src/hotspot/share/opto/vectornode.hpp +++ b/src/hotspot/share/opto/vectornode.hpp @@ -92,7 +92,8 @@ class VectorNode : public TypeNode { static bool is_rotate_opcode(int opc); - static int opcode(int opc, BasicType bt); + static int opcode(int sopc, BasicType bt); // scalar_opc -> vector_opc + static int scalar_opcode(int vopc, BasicType bt); // vector_opc -> scalar_opc static int replicate_opcode(BasicType bt); // Limits on vector size (number of elements) for auto-vectorization. @@ -207,12 +208,10 @@ class ReductionNode : public Node { } static ReductionNode* make(int opc, Node* ctrl, Node* in1, Node* in2, BasicType bt); - static ReductionNode* make_from_vopc(int vopc, Node* ctrl, Node* in1, Node* in2, BasicType bt); static int opcode(int opc, BasicType bt); static bool implemented(int opc, uint vlen, BasicType bt); - // Make an identity element (zero for add, one for mul, etc) for opc of scalar/vector reduction. - static Node* make_identity_input_for_reduction_from_scalar_opc(PhaseGVN& gvn, int opc, BasicType bt); - static Node* make_identity_input_for_reduction_from_vector_opc(PhaseGVN& gvn, int vopc, BasicType bt); + // Make an identity scalar (zero for add, one for mul, etc) for scalar opc. 
+ static Node* make_identity_con_scalar(PhaseGVN& gvn, int sopc, BasicType bt); virtual const Type* bottom_type() const { return _bottom_type; @@ -239,9 +238,6 @@ class UnorderedReductionNode : public ReductionNode { UnorderedReductionNode(Node * ctrl, Node* in1, Node* in2) : ReductionNode(ctrl, in1, in2) { init_class_id(Class_UnorderedReduction); } - - virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) = 0; - virtual bool make_normal_vector_op_implemented(const TypeVect* vt) = 0; }; //------------------------------AddReductionVINode-------------------------------------- @@ -250,12 +246,6 @@ class AddReductionVINode : public UnorderedReductionNode { public: AddReductionVINode(Node * ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; - virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { - return new AddVINode(in1, in2, vt); - } - virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { - return Matcher::match_rule_supported_vector(Op_AddVI, vt->length(), vt->element_basic_type()); - } }; //------------------------------AddReductionVLNode-------------------------------------- @@ -264,12 +254,6 @@ class AddReductionVLNode : public UnorderedReductionNode { public: AddReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; - virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { - return new AddVLNode(in1, in2, vt); - } - virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { - return Matcher::match_rule_supported_vector(Op_AddVL, vt->length(), vt->element_basic_type()); - } }; //------------------------------AddReductionVFNode-------------------------------------- @@ -430,12 +414,6 @@ class MulReductionVINode : public UnorderedReductionNode { public: MulReductionVINode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, 
in1, in2) {} virtual int Opcode() const; - virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { - return new MulVINode(in1, in2, vt); - } - virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { - return Matcher::match_rule_supported_vector(Op_MulVI, vt->length(), vt->element_basic_type()); - } }; //------------------------------MulReductionVLNode-------------------------------------- @@ -444,12 +422,6 @@ class MulReductionVLNode : public UnorderedReductionNode { public: MulReductionVLNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; - virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { - return new MulVLNode(in1, in2, vt); - } - virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { - return Matcher::match_rule_supported_vector(Op_MulVL, vt->length(), vt->element_basic_type()); - } }; //------------------------------MulReductionVFNode-------------------------------------- @@ -793,12 +765,6 @@ class AndReductionVNode : public UnorderedReductionNode { public: AndReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; - virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { - return new AndVNode(in1, in2, vt); - } - virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { - return Matcher::match_rule_supported_vector(Op_AndV, vt->length(), vt->element_basic_type()); - } }; //------------------------------OrVNode--------------------------------------- @@ -816,12 +782,6 @@ class OrReductionVNode : public UnorderedReductionNode { public: OrReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; - virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { - return new OrVNode(in1, in2, vt); - } - virtual bool 
make_normal_vector_op_implemented(const TypeVect* vt) { - return Matcher::match_rule_supported_vector(Op_OrV, vt->length(), vt->element_basic_type()); - } }; //------------------------------XorVNode--------------------------------------- @@ -839,12 +799,6 @@ class XorReductionVNode : public UnorderedReductionNode { public: XorReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; - virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { - return new XorVNode(in1, in2, vt); - } - virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { - return Matcher::match_rule_supported_vector(Op_XorV, vt->length(), vt->element_basic_type()); - } }; //------------------------------MinReductionVNode-------------------------------------- @@ -853,12 +807,6 @@ class MinReductionVNode : public UnorderedReductionNode { public: MinReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; - virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { - return new MinVNode(in1, in2, vt); - } - virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { - return Matcher::match_rule_supported_vector(Op_MinV, vt->length(), vt->element_basic_type()); - } }; //------------------------------MaxReductionVNode-------------------------------------- @@ -867,12 +815,6 @@ class MaxReductionVNode : public UnorderedReductionNode { public: MaxReductionVNode(Node *ctrl, Node* in1, Node* in2) : UnorderedReductionNode(ctrl, in1, in2) {} virtual int Opcode() const; - virtual VectorNode* make_normal_vector_op(Node* in1, Node* in2, const TypeVect* vt) { - return new MaxVNode(in1, in2, vt); - } - virtual bool make_normal_vector_op_implemented(const TypeVect* vt) { - return Matcher::match_rule_supported_vector(Op_MaxV, vt->length(), vt->element_basic_type()); - } }; 
//------------------------------CompressVNode-------------------------------------- From e3d99c9599dd63390086336967bac091570c8b5c Mon Sep 17 00:00:00 2001 From: Emanuel Peter Date: Wed, 17 May 2023 15:04:19 +0200 Subject: [PATCH 19/19] added missing float/double cases to VectorNode::scalar_opcode --- src/hotspot/share/opto/vectornode.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/hotspot/share/opto/vectornode.cpp b/src/hotspot/share/opto/vectornode.cpp index ed85079466e7f..52078b21b4665 100644 --- a/src/hotspot/share/opto/vectornode.cpp +++ b/src/hotspot/share/opto/vectornode.cpp @@ -348,6 +348,10 @@ int VectorNode::scalar_opcode(int sopc, BasicType bt) { return Op_MinI; case T_LONG: return Op_MinL; + case T_FLOAT: + return Op_MinF; + case T_DOUBLE: + return Op_MinD; default: assert(false, "basic type not handled"); return 0; @@ -365,6 +369,10 @@ int VectorNode::scalar_opcode(int sopc, BasicType bt) { return Op_MaxI; case T_LONG: return Op_MaxL; + case T_FLOAT: + return Op_MaxF; + case T_DOUBLE: + return Op_MaxD; default: assert(false, "basic type not handled"); return 0;