[GR-66145] Improve the intrinsification of Vector::slice

kittyoracle · kittyoracle · commit 4fd24a68f429 · 2025-09-06T19:54:05.000Z
PullRequest: graal/21422
diff --git a/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/lir/amd64/vector/AMD64VectorShuffle.java b/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/lir/amd64/vector/AMD64VectorShuffle.java
@@ -97,6 +97,7 @@
 import jdk.vm.ci.code.Register;
 import jdk.vm.ci.meta.AllocatableValue;
 import jdk.vm.ci.meta.JavaConstant;
+import jdk.vm.ci.meta.PlatformKind;
 import jdk.vm.ci.meta.Value;
 
 public class AMD64VectorShuffle {
@@ -360,6 +361,132 @@ private void emitBytePermute(CompilationResultBuilder crb, AMD64MacroAssembler m
         }
     }
 
+    /**
+     * A slice operation, see {@link jdk.graal.compiler.vector.nodes.amd64.AMD64SimdSliceNode}.
+     * <p>
+     * The result is the window of {@code result}-kind size that starts {@code originInBytes} bytes
+     * into the concatenation of {@code src1} (low bytes) and {@code src2} (high bytes). The emitted
+     * instruction sequence depends on the byte size of the result kind, on whether both inputs are
+     * the same value, and on the SIMD encoding. Temporaries are only reserved for the sequences
+     * that actually use them.
+     */
+    public static final class SliceOp extends AMD64LIRInstruction {
+        public static final LIRInstructionClass<SliceOp> TYPE = LIRInstructionClass.create(SliceOp.class);
+
+        @Def({OperandFlag.REG}) protected AllocatableValue result;
+        // Inputs are @Alive because several sequences write a temporary or the result before the
+        // last read of an input.
+        @Alive({OperandFlag.REG}) protected AllocatableValue src1;
+        @Alive({OperandFlag.REG}) protected AllocatableValue src2;
+        // Either temporary is Value.ILLEGAL when the selected sequence does not use it.
+        @Temp({OperandFlag.REG, OperandFlag.ILLEGAL}) protected AllocatableValue tmp1;
+        @Temp({OperandFlag.REG, OperandFlag.ILLEGAL}) protected AllocatableValue tmp2;
+        // Slice origin scaled from elements to bytes.
+        private final int originInBytes;
+        private final AMD64SIMDInstructionEncoding encoding;
+
+        /**
+         * Creates the slice instruction and reserves the temporaries its code sequence needs.
+         *
+         * @param gen generator used to create the temporary variables
+         * @param result destination vector; its platform kind determines the vector size and the
+         *            element size
+         * @param src1 input providing the low part of the concatenation
+         * @param src2 input providing the high part of the concatenation
+         * @param origin index, in elements, of the first element of the result within the
+         *            concatenation
+         * @param encoding SIMD encoding to emit the instructions with
+         */
+        public SliceOp(AMD64LIRGenerator gen, AllocatableValue result, AllocatableValue src1, AllocatableValue src2, int origin, AMD64SIMDInstructionEncoding encoding) {
+            super(TYPE);
+            AMD64Kind eKind = ((AMD64Kind) result.getPlatformKind()).getScalar();
+            this.result = result;
+            this.src1 = src1;
+            this.src2 = src2;
+            this.originInBytes = origin * eKind.getSizeInBytes();
+            this.encoding = encoding;
+            // Must run last: the temp decision reads every field assigned above.
+            allocateTempIfNecessary(gen);
+        }
+
+        @Override
+        public void emitCode(CompilationResultBuilder crb, AMD64MacroAssembler masm) {
+            int resultSize = result.getPlatformKind().getSizeInBytes();
+            switch (resultSize) {
+                case 4 -> {
+                    if (src1.equals(src2) && originInBytes == 2) {
+                        // Same input, shifted by one 16-bit word: swap the two low words.
+                        VexRMIOp.VPSHUFLW.encoding(encoding).emit(masm, XMM, asRegister(result), asRegister(src1), 0x1);
+                    } else {
+                        // Copy the low dword of src1 into the top dword of tmp1, so that tmp1:src2
+                        // is contiguous, then extract the window with a byte-wise align.
+                        VexRMIOp.VPSHUFD.encoding(encoding).emit(masm, XMM, asRegister(tmp1), asRegister(src1), 0);
+                        VexRVMIOp.VPALIGNR.encoding(encoding).emit(masm, XMM, asRegister(result), asRegister(src2), asRegister(tmp1), originInBytes + 12);
+                    }
+                }
+                case 8 -> {
+                    if (src1.equals(src2) && originInBytes % 2 == 0) {
+                        // Same input, shifted by whole 16-bit words: rotate the four low words.
+                        int imm;
+                        if (originInBytes == 2) {
+                            imm = 0b00111001;
+                        } else if (originInBytes == 4) {
+                            imm = 0b01001110;
+                        } else {
+                            GraalError.guarantee(originInBytes == 6, "unexpected originInBytes %d", originInBytes);
+                            imm = 0b10010011;
+                        }
+                        VexRMIOp.VPSHUFLW.encoding(encoding).emit(masm, XMM, asRegister(result), asRegister(src1), imm);
+                    } else {
+                        // Copy the low qword of src1 into the top qword of tmp1, then align as in
+                        // the 4-byte case.
+                        VexRMIOp.VPSHUFD.encoding(encoding).emit(masm, XMM, asRegister(tmp1), asRegister(src1), 0x40);
+                        VexRVMIOp.VPALIGNR.encoding(encoding).emit(masm, XMM, asRegister(result), asRegister(src2), asRegister(tmp1), originInBytes + 8);
+                    }
+                }
+                // A full 128-bit vector is exactly one byte-wise align of src2:src1.
+                case 16 -> VexRVMIOp.VPALIGNR.encoding(encoding).emit(masm, XMM, asRegister(result), asRegister(src2), asRegister(src1), originInBytes);
+                case 32 -> {
+                    if (encoding == AMD64SIMDInstructionEncoding.VEX || originInBytes % Integer.BYTES != 0) {
+                        // VPALIGNR works per 128-bit lane. First build the "middle" vector
+                        // (high lane of src1, low lane of src2), then align per lane against it.
+                        // With an origin of exactly one lane the middle vector is the result.
+                        Register tmp = originInBytes == 16 ? asRegister(result) : asRegister(tmp1);
+                        if (encoding == AMD64SIMDInstructionEncoding.VEX) {
+                            VexRVMIOp.VPERM2I128.emit(masm, YMM, tmp, asRegister(src1), asRegister(src2), 0x21);
+                        } else {
+                            VexRVMIOp.EVALIGND.emit(masm, YMM, tmp, asRegister(src2), asRegister(src1), 4);
+                        }
+                        if (originInBytes < 16) {
+                            VexRVMIOp.VPALIGNR.encoding(encoding).emit(masm, YMM, asRegister(result), asRegister(tmp1), asRegister(src1), originInBytes);
+                        } else if (originInBytes > 16) {
+                            VexRVMIOp.VPALIGNR.encoding(encoding).emit(masm, YMM, asRegister(result), asRegister(src2), asRegister(tmp1), originInBytes - 16);
+                        }
+                    } else {
+                        // EVEX with a dword-aligned origin: one dword-granular align suffices.
+                        VexRVMIOp.EVALIGND.emit(masm, YMM, asRegister(result), asRegister(src2), asRegister(src1), originInBytes / Integer.BYTES);
+                    }
+                }
+                case 64 -> {
+                    GraalError.guarantee(encoding == AMD64SIMDInstructionEncoding.EVEX, "unexpected encoding with 512-bit vector");
+                    if (originInBytes % 4 != 0) {
+                        // The origin is not dword-aligned. Shift src2:src1 by whole lanes with
+                        // EVALIGND to obtain the two lane-aligned neighbours of the window, then
+                        // combine them per lane with EVPALIGNR using the remaining byte offset.
+                        // At either end of the range one neighbour is an input itself.
+                        if (originInBytes < 16) {
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp1), asRegister(src2), asRegister(src1), 4);
+                            VexRVMIOp.EVPALIGNR.emit(masm, ZMM, asRegister(result), asRegister(tmp1), asRegister(src1), originInBytes);
+                        } else if (originInBytes < 32) {
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp1), asRegister(src2), asRegister(src1), 4);
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp2), asRegister(src2), asRegister(src1), 8);
+                            VexRVMIOp.EVPALIGNR.emit(masm, ZMM, asRegister(result), asRegister(tmp2), asRegister(tmp1), originInBytes - 16);
+                        } else if (originInBytes < 48) {
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp1), asRegister(src2), asRegister(src1), 8);
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp2), asRegister(src2), asRegister(src1), 12);
+                            VexRVMIOp.EVPALIGNR.emit(masm, ZMM, asRegister(result), asRegister(tmp2), asRegister(tmp1), originInBytes - 32);
+                        } else {
+                            VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(tmp1), asRegister(src2), asRegister(src1), 12);
+                            VexRVMIOp.EVPALIGNR.emit(masm, ZMM, asRegister(result), asRegister(src2), asRegister(tmp1), originInBytes - 48);
+                        }
+                    } else {
+                        // Dword-aligned origin: one dword-granular align suffices.
+                        VexRVMIOp.EVALIGND.emit(masm, ZMM, asRegister(result), asRegister(src2), asRegister(src1), originInBytes / Integer.BYTES);
+                    }
+                }
+                default -> GraalError.shouldNotReachHereUnexpectedValue(resultSize);
+            }
+        }
+
+        /**
+         * Reserves {@link #tmp1} and {@link #tmp2} only for the code sequences in
+         * {@link #emitCode} that use them; otherwise they are set to {@link Value#ILLEGAL}. The
+         * conditions here mirror the branch conditions in {@link #emitCode}.
+         */
+        private void allocateTempIfNecessary(AMD64LIRGenerator gen) {
+            PlatformKind resultKind = result.getPlatformKind();
+            boolean needsTemp;
+            if (resultKind.getSizeInBytes() < XMM.getBytes()) {
+                // Sub-128-bit: only the single-input, word-aligned rotate avoids the temp.
+                needsTemp = !src1.equals(src2) || originInBytes % 2 != 0;
+            } else if (resultKind.getSizeInBytes() == XMM.getBytes()) {
+                needsTemp = false;
+            } else if (encoding == AMD64SIMDInstructionEncoding.VEX) {
+                // With an origin of exactly one 128-bit lane the lane permute is written straight
+                // into the result and tmp1 is never read or written, so do not reserve it.
+                needsTemp = originInBytes != 16;
+            } else {
+                needsTemp = (originInBytes % Integer.BYTES != 0);
+            }
+            if (needsTemp) {
+                tmp1 = gen.newVariable(LIRKind.value(resultKind));
+            } else {
+                tmp1 = Value.ILLEGAL;
+            }
+
+            // Only the two middle 512-bit ranges need both lane-aligned neighbours in temps.
+            if (resultKind.getSizeInBytes() == ZMM.getBytes() && originInBytes % Integer.BYTES != 0 &&
+                            originInBytes > 16 && originInBytes < 48) {
+                GraalError.guarantee(!tmp1.equals(Value.ILLEGAL), "must have tmp1 with originInBytes = %d", originInBytes);
+                tmp2 = gen.newVariable(LIRKind.value(resultKind));
+            } else {
+                tmp2 = Value.ILLEGAL;
+            }
+        }
+    }
+
     public static final class IntToVectorOp extends AMD64LIRInstruction {
         public static final LIRInstructionClass<IntToVectorOp> TYPE = LIRInstructionClass.create(IntToVectorOp.class);
 
diff --git a/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/lir/amd64/AMD64VectorArithmeticLIRGenerator.java b/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/lir/amd64/AMD64VectorArithmeticLIRGenerator.java
@@ -510,4 +510,14 @@ public Value emitVectorPermute(LIRKind resultKind, Value source, Value indices)
         getLIRGen().append(AMD64VectorShuffle.PermuteOp.create(getAMD64LIRGen(), result, asAllocatable(source), asAllocatable(indices), getSimdEncoding()));
         return result;
     }
+
+    /**
+     * Emits the LIR for a vector slice, i.e. the operation described by
+     * {@code jdk.incubator.vector.Vector<E>::slice(int, jdk.incubator.vector.Vector<E>)}.
+     *
+     * @param resultKind kind of the freshly created result variable
+     * @param src1 first input of the slice
+     * @param src2 second input of the slice
+     * @param origin slice origin, forwarded unchanged to {@link AMD64VectorShuffle.SliceOp}
+     * @return the variable that holds the sliced vector
+     */
+    public Value emitVectorSlice(LIRKind resultKind, Value src1, Value src2, int origin) {
+        Variable result = getLIRGen().newVariable(resultKind);
+        AMD64VectorShuffle.SliceOp sliceOp = new AMD64VectorShuffle.SliceOp(getAMD64LIRGen(), result, asAllocatable(src1), asAllocatable(src2), origin, getSimdEncoding());
+        getLIRGen().append(sliceOp);
+        return result;
+    }
 }
diff --git a/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/nodes/amd64/AMD64SimdSliceNode.java b/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/nodes/amd64/AMD64SimdSliceNode.java
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2025, Oracle and/or its affiliates. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.  Oracle designates this
+ * particular file as subject to the "Classpath" exception as provided
+ * by Oracle in the LICENSE file that accompanied this code.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ */
+package jdk.graal.compiler.vector.nodes.amd64;
+
+import jdk.graal.compiler.core.common.LIRKind;
+import jdk.graal.compiler.debug.GraalError;
+import jdk.graal.compiler.graph.NodeClass;
+import jdk.graal.compiler.nodeinfo.NodeCycles;
+import jdk.graal.compiler.nodeinfo.NodeInfo;
+import jdk.graal.compiler.nodeinfo.NodeSize;
+import jdk.graal.compiler.nodes.NodeView;
+import jdk.graal.compiler.nodes.ValueNode;
+import jdk.graal.compiler.nodes.calc.FloatingNode;
+import jdk.graal.compiler.nodes.spi.NodeLIRBuilderTool;
+import jdk.graal.compiler.vector.lir.VectorLIRGeneratorTool;
+import jdk.graal.compiler.vector.lir.VectorLIRLowerable;
+import jdk.graal.compiler.vector.lir.amd64.AMD64VectorArithmeticLIRGenerator;
+import jdk.graal.compiler.vector.nodes.simd.SimdStamp;
+
+/**
+ * A slice operation concatenates its inputs into a sequence of {@code VLENGTH * 2} elements, then
+ * {@code VLENGTH} elements are collected starting at index {@link #origin} to form the result. If
+ * the 2 inputs are the same, then this operation is the same as rotating the input left by
+ * {@link #origin} elements.
+ */
+@NodeInfo(cycles = NodeCycles.CYCLES_1, size = NodeSize.SIZE_1)
+public class AMD64SimdSliceNode extends FloatingNode implements VectorLIRLowerable {
+
+    public static final NodeClass<AMD64SimdSliceNode> TYPE = NodeClass.create(AMD64SimdSliceNode.class);
+
+    @Input protected ValueNode src1;
+    @Input protected ValueNode src2;
+    private final int origin;
+
+    protected AMD64SimdSliceNode(SimdStamp stamp, ValueNode src1, ValueNode src2, int origin) {
+        super(TYPE, stamp);
+        this.src1 = src1;
+        this.src2 = src2;
+        this.origin = origin;
+    }
+
+    public static AMD64SimdSliceNode create(ValueNode src1, ValueNode src2, int origin) {
+        GraalError.guarantee(src1.stamp(NodeView.DEFAULT) instanceof SimdStamp, "unexpected input stamp %s", src1);
+        SimdStamp stamp = (SimdStamp) src1.stamp(NodeView.DEFAULT).unrestricted();
+        GraalError.guarantee(stamp.isCompatible(src2.stamp(NodeView.DEFAULT)), "unexpected input stamps: %s, %s", src1, src2);
+        GraalError.guarantee(origin > 0 && origin < stamp.getVectorLength(), "unexpected origin %d of vector input %s", origin, src1);
+        return new AMD64SimdSliceNode(stamp, src1, src2, origin);
+    }
+
+    public ValueNode getSrc1() {
+        return src1;
+    }
+
+    public ValueNode getSrc2() {
+        return src2;
+    }
+
+    public int getOrigin() {
+        return origin;
+    }
+
+    @Override
+    public void generate(NodeLIRBuilderTool builder, VectorLIRGeneratorTool gen) {
+        LIRKind resultKind = builder.getLIRGeneratorTool().getLIRKind(stamp);
+        builder.setResult(this, ((AMD64VectorArithmeticLIRGenerator) gen).emitVectorSlice(resultKind, builder.operand(src1), builder.operand(src2), origin));
+    }
+}
diff --git a/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/nodes/simd/SimdBlendWithConstantMaskNode.java b/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/nodes/simd/SimdBlendWithConstantMaskNode.java
@@ -89,6 +89,10 @@ public Stamp foldStamp(Stamp falseStamp, Stamp trueStamp) {
 
     @Override
     public Node canonical(CanonicalizerTool tool, ValueNode falseValues, ValueNode trueValues) {
+        if (falseValues == trueValues) {
+            return falseValues;
+        }
+
         boolean allSecond = true;
         boolean allFirst = true;
         for (int i = 0; (allSecond || allFirst) && i < selector.length; ++i) {
diff --git a/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/nodes/simd/SimdBlendWithLogicMaskNode.java b/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/nodes/simd/SimdBlendWithLogicMaskNode.java
@@ -86,6 +86,10 @@ public Stamp foldStamp(Stamp stampX, Stamp stampY, Stamp stampZ) {
 
     @Override
     public Node canonical(CanonicalizerTool tool, ValueNode forX, ValueNode forY, ValueNode forZ) {
+        if (forX == forY) {
+            return forX;
+        }
+
         ValueNode mask = forZ;
         Stamp toStamp = mask.stamp(NodeView.DEFAULT);
         while (mask instanceof ReinterpretNode simdMask && SimdStamp.isOpmask(simdMask.stamp(NodeView.DEFAULT))) {
diff --git a/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/phases/amd64/AMD64VectorLoweringPhase.java b/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/phases/amd64/AMD64VectorLoweringPhase.java
diff --git a/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/replacements/vectorapi/nodes/VectorAPIRearrangeOpNode.java b/compiler/src/jdk.graal.compiler/src/jdk/graal/compiler/vector/replacements/vectorapi/nodes/VectorAPIRearrangeOpNode.java