
Conversation

ylzsx (Contributor) commented Oct 15, 2025

On LASX, the types v4i1/v8i1/v16i1 may be legalized to v4i32/v8i16/v16i8, which occupy LSX-sized registers. In most cases we actually compare or select LASX-sized registers, and mixing the two types creates horrible code.

llvmbot (Member) commented Oct 15, 2025

@llvm/pr-subscribers-backend-loongarch

Author: Zhaoxin Yang (ylzsx)

Changes

On LASX, the types v4i1/v8i1/v16i1 may be legalized to v4i32/v8i16/v16i8, which occupy LSX-sized registers. In most cases we actually compare or select LASX-sized registers, and mixing the two types creates horrible code.
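
For reference, the IR pattern these tests exercise looks roughly like the sketch below. This is illustrative only (the exact test bodies are truncated in the diff): the fcmp produces a v8i1 mask, the xor is the i1-level logic op, and the zext widens the mask back to a 256-bit vector. Without the combine, the v8i1 value is legalized to the LSX-sized v8i16, which forces the element-by-element shuffling visible in the removed CHECK lines.

; Illustrative sketch of a mask xor + zext pattern (not a verbatim test body).
define void @xor_zext_mask_sketch(ptr %res, ptr %a, ptr %b) nounwind {
  %v0 = load <8 x float>, ptr %a
  %v1 = load <8 x float>, ptr %b
  %m0 = fcmp olt <8 x float> %v0, %v1   ; 256-bit compare, result is <8 x i1>
  %m1 = xor <8 x i1> %m0, <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>
  %r = zext <8 x i1> %m1 to <8 x i32>   ; widen the mask back to 256 bits
  store <8 x i32> %r, ptr %res
  ret void
}

With the combine added here, the logic op is performed directly on the 256-bit compare result (xvxor.v on $xr registers) and only the final mask materialization remains, as the updated CHECK lines show.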


Patch is 37.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163523.diff

2 Files Affected:

  • (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+125-1)
  • (modified) llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll (+83-583)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index f7deeafc9ccfc..509ae3f0c5e1a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -466,8 +466,12 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
 
   // Set DAG combine for 'LASX' feature.
 
-  if (Subtarget.hasExtLASX())
+  if (Subtarget.hasExtLASX()) {
     setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::ANY_EXTEND);
+    setTargetDAGCombine(ISD::ZERO_EXTEND);
+    setTargetDAGCombine(ISD::SIGN_EXTEND);
+  }
 
   // Compute derived properties from the register classes.
   computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -6679,6 +6683,122 @@ performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
+// logical operations, like in the example below.
+//   or (and (truncate x, truncate y)),
+//      (xor (truncate z, build_vector (constants)))
+// Given a target type \p VT, we generate
+//   or (and x, y), (xor z, zext(build_vector (constants)))
+// given x, y and z are of type \p VT. We can do so, if operands are either
+// truncates from VT types, the second operand is a vector of constants, can
+// be recursively promoted or is an existing extension we can extend further.
+static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
+                                     SelectionDAG &DAG,
+                                     const LoongArchSubtarget &Subtarget,
+                                     unsigned Depth) {
+  // Limit recursion to avoid excessive compile times.
+  if (Depth >= SelectionDAG::MaxRecursionDepth)
+    return SDValue();
+
+  if (!ISD::isBitwiseLogicOp(N.getOpcode()))
+    return SDValue();
+
+  SDValue N0 = N.getOperand(0);
+  SDValue N1 = N.getOperand(1);
+
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
+    return SDValue();
+
+  if (SDValue NN0 =
+          PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
+    N0 = NN0;
+  else {
+    // The left side has to be a 'trunc'.
+    bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
+                    N0.getOperand(0).getValueType() == VT;
+    if (LHSTrunc)
+      N0 = N0.getOperand(0);
+    else
+      return SDValue();
+  }
+
+  if (SDValue NN1 =
+          PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
+    N1 = NN1;
+  else {
+    // The right side has to be a 'trunc', a (foldable) constant or an
+    // existing extension we can extend further.
+    bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
+                    N1.getOperand(0).getValueType() == VT;
+    if (RHSTrunc)
+      N1 = N1.getOperand(0);
+    else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
+             Subtarget.hasExtLASX() && N1.hasOneUse())
+      N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
+    // On 32-bit platform, i64 is an illegal integer scalar type, and
+    // FoldConstantArithmetic will fail for v4i64. This may be optimized in the
+    // future.
+    else if (SDValue Cst =
+                 DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
+      N1 = Cst;
+    else
+      return SDValue();
+  }
+
+  return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
+}
+
+// On LASX the type v4i1/v8i1/v16i1 may be legalized to v4i32/v8i16/v16i8, which
+// is LSX-sized register. In most cases we actually compare or select LASX-sized
+// registers and mixing the two types creates horrible code. This method
+// optimizes some of the transition sequences.
+static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
+                                     SelectionDAG &DAG,
+                                     const LoongArchSubtarget &Subtarget) {
+  EVT VT = N.getValueType();
+  assert(VT.isVector() && "Expected vector type");
+  assert((N.getOpcode() == ISD::ANY_EXTEND ||
+          N.getOpcode() == ISD::ZERO_EXTEND ||
+          N.getOpcode() == ISD::SIGN_EXTEND) &&
+         "Invalid Node");
+
+  if (!Subtarget.hasExtLASX() || !VT.is256BitVector())
+    return SDValue();
+
+  SDValue Narrow = N.getOperand(0);
+  EVT NarrowVT = Narrow.getValueType();
+
+  // Generate the wide operation.
+  SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
+  if (!Op)
+    return SDValue();
+  switch (N.getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode");
+  case ISD::ANY_EXTEND:
+    return Op;
+  case ISD::ZERO_EXTEND:
+    return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
+  case ISD::SIGN_EXTEND:
+    return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
+                       DAG.getValueType(NarrowVT));
+  }
+}
+
+static SDValue performANY_EXTENDCombine(SDNode *N, SelectionDAG &DAG,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const LoongArchSubtarget &Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  if (VT.isVector())
+    if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
+      return R;
+
+  return SDValue();
+}
+
 SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
                                                    DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -6695,6 +6815,10 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
     return performSRLCombine(N, DAG, DCI, Subtarget);
   case ISD::BITCAST:
     return performBITCASTCombine(N, DAG, DCI, Subtarget);
+  case ISD::ANY_EXTEND:
+  case ISD::ZERO_EXTEND:
+  case ISD::SIGN_EXTEND:
+    return performANY_EXTENDCombine(N, DAG, DCI, Subtarget);
   case LoongArchISD::BITREV_W:
     return performBITREV_WCombine(N, DAG, DCI, Subtarget);
   case LoongArchISD::BR_CC:
diff --git a/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll b/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll
index cd98ba7e4083c..59757c27bd020 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll
@@ -31,28 +31,12 @@ define void @xor_zext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    xvld $xr0, $a1, 0
 ; LA64-NEXT:    xvld $xr1, $a2, 0
+; LA64-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI0_0)
+; LA64-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI0_0)
 ; LA64-NEXT:    xvfcmp.clt.d $xr0, $xr0, $xr1
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 1
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 2
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 3
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT:    vldi $vr0, -1777
-; LA64-NEXT:    vxor.v $vr0, $vr1, $vr0
-; LA64-NEXT:    vpickve2gr.w $a1, $vr0, 2
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 0
-; LA64-NEXT:    vpickve2gr.w $a1, $vr0, 3
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT:    vpickve2gr.w $a1, $vr0, 0
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 0
-; LA64-NEXT:    vpickve2gr.w $a1, $vr0, 1
-; LA64-NEXT:    vinsgr2vr.d $vr2, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr2, $xr1, 2
-; LA64-NEXT:    xvrepli.d $xr0, 1
-; LA64-NEXT:    xvand.v $xr0, $xr2, $xr0
+; LA64-NEXT:    xvxor.v $xr0, $xr0, $xr2
+; LA64-NEXT:    xvrepli.d $xr1, 1
+; LA64-NEXT:    xvand.v $xr0, $xr0, $xr1
 ; LA64-NEXT:    xvst $xr0, $a0, 0
 ; LA64-NEXT:    ret
   %v0 = load <4 x double>, ptr %a
@@ -70,43 +54,10 @@ define void @xor_zext_masks_v8i32(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
 ; CHECK-NEXT:    xvfcmp.clt.s $xr0, $xr0, $xr1
-; CHECK-NEXT:    xvpickve2gr.w $a1, $xr0, 0
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT:    xvpickve2gr.w $a1, $xr0, 1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT:    xvpickve2gr.w $a1, $xr0, 2
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT:    xvpickve2gr.w $a1, $xr0, 3
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT:    xvpickve2gr.w $a1, $xr0, 4
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT:    xvpickve2gr.w $a1, $xr0, 5
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT:    xvpickve2gr.w $a1, $xr0, 6
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT:    xvpickve2gr.w $a1, $xr0, 7
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT:    vldi $vr0, -2305
-; CHECK-NEXT:    vxor.v $vr0, $vr1, $vr0
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT:    vinsgr2vr.w $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 0
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 1
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 2
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; CHECK-NEXT:    xvpermi.q $xr2, $xr1, 2
-; CHECK-NEXT:    xvrepli.w $xr0, 1
-; CHECK-NEXT:    xvand.v $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvldi $xr1, -1789
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvrepli.w $xr1, 1
+; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <8 x float>, ptr %a
@@ -124,76 +75,10 @@ define void @xor_zext_masks_v16i16(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
 ; CHECK-NEXT:    xvseq.h $xr0, $xr0, $xr1
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 5
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 6
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 7
-; CHECK-NEXT:    xvpermi.d $xr0, $xr0, 14
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 8
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 9
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 10
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 11
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 12
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 13
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 14
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 15
-; CHECK-NEXT:    vrepli.h $vr0, 255
-; CHECK-NEXT:    vxor.v $vr0, $vr1, $vr0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 8
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 9
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 10
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 11
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 12
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 13
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 14
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 15
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 0
-; CHECK-NEXT:    vinsgr2vr.h $vr2, $a1, 0
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 1
-; CHECK-NEXT:    vinsgr2vr.h $vr2, $a1, 1
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 2
-; CHECK-NEXT:    vinsgr2vr.h $vr2, $a1, 2
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 3
-; CHECK-NEXT:    vinsgr2vr.h $vr2, $a1, 3
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 4
-; CHECK-NEXT:    vinsgr2vr.h $vr2, $a1, 4
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 5
-; CHECK-NEXT:    vinsgr2vr.h $vr2, $a1, 5
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 6
-; CHECK-NEXT:    vinsgr2vr.h $vr2, $a1, 6
-; CHECK-NEXT:    vpickve2gr.b $a1, $vr0, 7
-; CHECK-NEXT:    vinsgr2vr.h $vr2, $a1, 7
-; CHECK-NEXT:    xvpermi.q $xr2, $xr1, 2
-; CHECK-NEXT:    xvrepli.h $xr0, 1
-; CHECK-NEXT:    xvand.v $xr0, $xr2, $xr0
+; CHECK-NEXT:    xvrepli.w $xr1, 255
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvrepli.h $xr1, 1
+; CHECK-NEXT:    xvand.v $xr0, $xr0, $xr1
 ; CHECK-NEXT:    xvst $xr0, $a0, 0
 ; CHECK-NEXT:    ret
   %v0 = load <16 x i16>, ptr %a
@@ -238,22 +123,12 @@ define void @xor_sext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
 ; LA64:       # %bb.0:
 ; LA64-NEXT:    xvld $xr0, $a1, 0
 ; LA64-NEXT:    xvld $xr1, $a2, 0
+; LA64-NEXT:    pcalau12i $a1, %pc_hi20(.LCPI3_0)
+; LA64-NEXT:    xvld $xr2, $a1, %pc_lo12(.LCPI3_0)
 ; LA64-NEXT:    xvfcmp.clt.d $xr0, $xr0, $xr1
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 0
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 2
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT:    xvpickve2gr.d $a1, $xr0, 1
-; LA64-NEXT:    xvpickve2gr.d $a2, $xr0, 3
-; LA64-NEXT:    vrepli.b $vr0, -1
-; LA64-NEXT:    vxor.v $vr0, $vr1, $vr0
-; LA64-NEXT:    vpickve2gr.w $a3, $vr0, 2
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a3, 0
-; LA64-NEXT:    vinsgr2vr.d $vr1, $a2, 1
-; LA64-NEXT:    vpickve2gr.w $a2, $vr0, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a2, 0
-; LA64-NEXT:    vinsgr2vr.d $vr0, $a1, 1
-; LA64-NEXT:    xvpermi.q $xr0, $xr1, 2
+; LA64-NEXT:    xvxor.v $xr0, $xr0, $xr2
+; LA64-NEXT:    xvslli.d $xr0, $xr0, 32
+; LA64-NEXT:    xvsrai.d $xr0, $xr0, 32
 ; LA64-NEXT:    xvst $xr0, $a0, 0
 ; LA64-NEXT:    ret
   %v0 = load <4 x double>, ptr %a
@@ -266,87 +141,17 @@ define void @xor_sext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
 }
 
 define void @xor_sext_masks_v8i32(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: xor_sext_masks_v8i32:
-; LA32:       # %bb.0:
-; LA32-NEXT:    xvld $xr0, $a1, 0
-; LA32-NEXT:    xvld $xr1, $a2, 0
-; LA32-NEXT:    xvfcmp.clt.s $xr0, $xr0, $xr1
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 0
-; LA32-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 2
-; LA32-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 4
-; LA32-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 6
-; LA32-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; LA32-NEXT:    xvpickve2gr.w $a1, $xr0, 3
-; LA32-NEXT:    xvpickve2gr.w $a2, $xr0, 1
-; LA32-NEXT:    xvpickve2gr.w $a3, $xr0, 7
-; LA32-NEXT:    xvpickve2gr.w $a4, $xr0, 5
-; LA32-NEXT:    vrepli.b $vr0, -1
-; LA32-NEXT:    vxor.v $vr0, $vr1, $vr0
-; LA32-NEXT:    vpickve2gr.h $a5, $vr0, 4
-; LA32-NEXT:    ext.w.h $a5, $a5
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a5, 0
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a4, 1
-; LA32-NEXT:    vpickve2gr.h $a4, $vr0, 6
-; LA32-NEXT:    ext.w.h $a4, $a4
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a4, 2
-; LA32-NEXT:    vinsgr2vr.w $vr1, $a3, 3
-; LA32-NEXT:    vpickve2gr.h $a3, $vr0, 0
-; LA32-NEXT:    ext.w.h $a3, $a3
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a3, 0
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a2, 1
-; LA32-NEXT:    vpickve2gr.h $a2, $vr0, 2
-; LA32-NEXT:    ext.w.h $a2, $a2
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a2, 2
-; LA32-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT:    xvpermi.q $xr2, $xr1, 2
-; LA32-NEXT:    xvst $xr2, $a0, 0
-; LA32-NEXT:    ret
-;
-; LA64-LABEL: xor_sext_masks_v8i32:
-; LA64:       # %bb.0:
-; LA64-NEXT:    xvld $xr0, $a1, 0
-; LA64-NEXT:    xvld $xr1, $a2, 0
-; LA64-NEXT:    xvfcmp.clt.s $xr0, $xr0, $xr1
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 0
-; LA64-NEXT:    vinsgr2vr.h $vr1, $a1, 0
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 2
-; LA64-NEXT:    vinsgr2vr.h $vr1, $a1, 2
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 4
-; LA64-NEXT:    vinsgr2vr.h $vr1, $a1, 4
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 6
-; LA64-NEXT:    vinsgr2vr.h $vr1, $a1, 6
-; LA64-NEXT:    xvpickve2gr.w $a1, $xr0, 3
-; LA64-NEXT:    xvpickve2gr.w $a2, $xr0, 1
-; LA64-NEXT:    xvpickve2gr.w $a3, $xr0, 7
-; LA64-NEXT:    xvpickve2gr.w $a4, $xr0, 5
-; LA64-NEXT:    vrepli.b $vr0, -1
-; LA64-NEXT:    vxor.v $vr0, $vr1, $vr0
-; LA64-NEXT:    vpickve2gr.h $a5, $vr0, 4
-; LA64-NEXT:    ext.w.h $a5, $a5
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a5, 0
-; LA64-NEXT:    ext.w.h $a4, $a4
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a4, 1
-; LA64-NEXT:    vpickve2gr.h $a4, $vr0, 6
-; LA64-NEXT:    ext.w.h $a4, $a4
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a4, 2
-; LA64-NEXT:    ext.w.h $a3, $a3
-; LA64-NEXT:    vinsgr2vr.w $vr1, $a3, 3
-; LA64-NEXT:    vpickve2gr.h $a3, $vr0, 0
-; LA64-NEXT:    ext.w.h $a3, $a3
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a3, 0
-; LA64-NEXT:    ext.w.h $a2, $a2
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a2, 1
-; LA64-NEXT:    vpickve2gr.h $a2, $vr0, 2
-; LA64-NEXT:    ext.w.h $a2, $a2
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a2, 2
-; LA64-NEXT:    ext.w.h $a1, $a1
-; LA64-NEXT:    vinsgr2vr.w $vr2, $a1, 3
-; LA64-NEXT:    xvpermi.q $xr2, $xr1, 2
-; LA64-NEXT:    xvst $xr2, $a0, 0
-; LA64-NEXT:    ret
+; CHECK-LABEL: xor_sext_masks_v8i32:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    xvld $xr0, $a1, 0
+; CHECK-NEXT:    xvld $xr1, $a2, 0
+; CHECK-NEXT:    xvfcmp.clt.s $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvldi $xr1, -1789
+; CHECK-NEXT:    xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT:    xvslli.w $xr0, $xr0, 16
+; CHECK-NEXT:    xvsrai.w $xr0, $xr0, 16
+; CHECK-NEXT:    xvst $xr0, $a0, 0
+; CHECK-NEXT:    ret
   %v0 = load <8 x float>, ptr %a
   %v1 = load <8 x float>, ptr %b
   %m0 = fcmp olt <8 x float> %v0, %v1
@@ -362,74 +167,11 @@ define void @xor_sext_masks_v16i16(ptr %res, ptr %a, ptr %b) nounwind {
 ; CHECK-NEXT:    xvld $xr0, $a1, 0
 ; CHECK-NEXT:    xvld $xr1, $a2, 0
 ; CHECK-NEXT:    xvseq.h $xr0, $xr0, $xr1
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 0
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 2
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 4
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 6
-; CHECK-NEXT:    xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr2, 0
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 8
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr2, 2
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 10
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr2, 4
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 12
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr2, 6
-; CHECK-NEXT:    vinsgr2vr.b $vr1, $a1, 14
-; CHECK-NEXT:    vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT:    vpickve2gr.h $a2, $vr0, 5
-; CHECK-NEXT:    vpickve2gr.h $a3, $vr0, 3
-; CHECK-NEXT:    vpickve2gr.h $a4, $vr0, 1
-; CHECK-NEXT:    vpickve2gr.h $a5, $vr2, 7
-; CHECK-NEXT:    vpickve2gr.h $a6, $vr2, 5
-; CHECK-NEXT:    vpickve2gr.h $a7, $vr2, 3
-; CHECK-NEXT:    vpickve2gr.h $t0, $vr2, 1
-; CHECK-NEXT:    vxori.b $vr0, $vr1, 255
-; CHECK-NEXT:    vpickve2gr.b $t1, $vr0, 8
-; CHECK-NEXT:    ext.w.b $t1, $t1
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $t1, 0
-; CHECK-NEXT:    ext.w.b $t0, $t0
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $t0, 1
-; CHECK-NEXT:    vpickve2gr.b $t0, $vr0, 10
-; CHECK-NEXT:    ext.w.b $t0, $t0
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $t0, 2
-; CHECK-NEXT:    ext.w.b $a7, $a7
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a7, 3
-; CHECK-NEXT:    vpickve2gr.b $a7, $vr0, 12
-; CHECK-NEXT:    ext.w.b $a7, $a7
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a7, 4
-; CHECK-NEXT:    ext.w.b $a6, $a6
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a6, 5
-; CHECK-NEXT:    vpickve2gr.b $a6, $vr0, 14
-; CHECK-NEXT:    ext.w.b $a6, $a6
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a6, 6
-; CHECK-NEXT:    ext.w.b $a5, $a5
-; CHECK-NEXT:    vinsgr2vr.h $vr1, $a5, 7
-; CHECK-NEXT:    vpickve2gr.b $a5, $vr0, 0
-; CHECK-NEXT:    ext.w.b ...
[truncated]
