[LoongArch] Try to avoid casts around logical vector ops on lasx #163523
Base branch: users/ylzsx/precommit-vxi1-masks
Conversation
On LASX the types v4i1/v8i1/v16i1 may be legalized to v4i32/v8i16/v16i8, which are LSX-sized registers. In most cases we actually compare or select LASX-sized registers, and mixing the two types creates horrible code.
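To make the problem concrete, the shape being optimized looks roughly like the sketch below. This is a hypothetical reduction written after the tests in vxi1-masks.ll; the exact mask constants and bodies of the checked-in tests may differ.

define void @xor_zext_mask_sketch(ptr %res, ptr %a, ptr %b) nounwind {
  %v0 = load <4 x double>, ptr %a
  %v1 = load <4 x double>, ptr %b
  ; The <4 x i1> compare result is legalized to an LSX-sized v4i32 register...
  %m = fcmp olt <4 x double> %v0, %v1
  ; ...so this xor is performed on the narrow type...
  %x = xor <4 x i1> %m, <i1 true, i1 false, i1 true, i1 false>
  ; ...while the extended result is an LASX-sized v4i64, so the old code had to
  ; move every element between the two register widths.
  %z = zext <4 x i1> %x to <4 x i64>
  store <4 x i64> %z, ptr %res
  ret void
}

With the combine added in this patch, the xor is widened to v4i64 and performed directly in an LASX register, followed by a single zero-extend-in-reg (visible as the xvand.v with a splat of 1 in the updated checks).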
@llvm/pr-subscribers-backend-loongarch

Author: Zhaoxin Yang (ylzsx)

Patch is 37.27 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163523.diff

2 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index f7deeafc9ccfc..509ae3f0c5e1a 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -466,8 +466,12 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM,
// Set DAG combine for 'LASX' feature.
- if (Subtarget.hasExtLASX())
+ if (Subtarget.hasExtLASX()) {
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::ZERO_EXTEND);
+ setTargetDAGCombine(ISD::SIGN_EXTEND);
+ }
// Compute derived properties from the register classes.
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -6679,6 +6683,122 @@ performEXTRACT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
+// logical operations, like in the example below.
+// or (and (truncate x, truncate y)),
+// (xor (truncate z, build_vector (constants)))
+// Given a target type \p VT, we generate
+// or (and x, y), (xor z, zext(build_vector (constants)))
+// given that x, y and z are of type \p VT. We can do so if each operand is a
+// truncate from VT or can be recursively promoted; the second operand may also
+// be a foldable vector of constants or an extension we can extend further.
+static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL, EVT VT,
+ SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget,
+ unsigned Depth) {
+ // Limit recursion to avoid excessive compile times.
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
+ if (!ISD::isBitwiseLogicOp(N.getOpcode()))
+ return SDValue();
+
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrPromote(N.getOpcode(), VT))
+ return SDValue();
+
+ if (SDValue NN0 =
+ PromoteMaskArithmetic(N0, DL, VT, DAG, Subtarget, Depth + 1))
+ N0 = NN0;
+ else {
+ // The left side has to be a 'trunc'.
+ bool LHSTrunc = N0.getOpcode() == ISD::TRUNCATE &&
+ N0.getOperand(0).getValueType() == VT;
+ if (LHSTrunc)
+ N0 = N0.getOperand(0);
+ else
+ return SDValue();
+ }
+
+ if (SDValue NN1 =
+ PromoteMaskArithmetic(N1, DL, VT, DAG, Subtarget, Depth + 1))
+ N1 = NN1;
+ else {
+ // The right side has to be a 'trunc', a (foldable) constant or an
+ // existing extension we can extend further.
+ bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getValueType() == VT;
+ if (RHSTrunc)
+ N1 = N1.getOperand(0);
+ else if (ISD::isExtVecInRegOpcode(N1.getOpcode()) && VT.is256BitVector() &&
+ Subtarget.hasExtLASX() && N1.hasOneUse())
+ N1 = DAG.getNode(N1.getOpcode(), DL, VT, N1.getOperand(0));
+ // On 32-bit platform, i64 is an illegal integer scalar type, and
+ // FoldConstantArithmetic will fail for v4i64. This may be optimized in the
+ // future.
+ else if (SDValue Cst =
+ DAG.FoldConstantArithmetic(ISD::ZERO_EXTEND, DL, VT, {N1}))
+ N1 = Cst;
+ else
+ return SDValue();
+ }
+
+ return DAG.getNode(N.getOpcode(), DL, VT, N0, N1);
+}
+
+// On LASX the types v4i1/v8i1/v16i1 may be legalized to v4i32/v8i16/v16i8,
+// which are LSX-sized registers. In most cases we actually compare or select
+// LASX-sized registers, and mixing the two types creates horrible code. This
+// method optimizes some of the transition sequences.
+static SDValue PromoteMaskArithmetic(SDValue N, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const LoongArchSubtarget &Subtarget) {
+ EVT VT = N.getValueType();
+ assert(VT.isVector() && "Expected vector type");
+ assert((N.getOpcode() == ISD::ANY_EXTEND ||
+ N.getOpcode() == ISD::ZERO_EXTEND ||
+ N.getOpcode() == ISD::SIGN_EXTEND) &&
+ "Invalid Node");
+
+ if (!Subtarget.hasExtLASX() || !VT.is256BitVector())
+ return SDValue();
+
+ SDValue Narrow = N.getOperand(0);
+ EVT NarrowVT = Narrow.getValueType();
+
+ // Generate the wide operation.
+ SDValue Op = PromoteMaskArithmetic(Narrow, DL, VT, DAG, Subtarget, 0);
+ if (!Op)
+ return SDValue();
+ switch (N.getOpcode()) {
+ default:
+ llvm_unreachable("Unexpected opcode");
+ case ISD::ANY_EXTEND:
+ return Op;
+ case ISD::ZERO_EXTEND:
+ return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
+ case ISD::SIGN_EXTEND:
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
+ DAG.getValueType(NarrowVT));
+ }
+}
+
+static SDValue performANY_EXTENDCombine(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const LoongArchSubtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+ SDLoc DL(N);
+
+ if (VT.isVector())
+ if (SDValue R = PromoteMaskArithmetic(SDValue(N, 0), DL, DAG, Subtarget))
+ return R;
+
+ return SDValue();
+}
+
SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -6695,6 +6815,10 @@ SDValue LoongArchTargetLowering::PerformDAGCombine(SDNode *N,
return performSRLCombine(N, DAG, DCI, Subtarget);
case ISD::BITCAST:
return performBITCASTCombine(N, DAG, DCI, Subtarget);
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND:
+ case ISD::SIGN_EXTEND:
+ return performANY_EXTENDCombine(N, DAG, DCI, Subtarget);
case LoongArchISD::BITREV_W:
return performBITREV_WCombine(N, DAG, DCI, Subtarget);
case LoongArchISD::BR_CC:
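The comment above PromoteMaskArithmetic describes the rewrite in SelectionDAG terms; the Depth parameter lets a whole tree of bitwise mask operations be widened in one pass. A hypothetical IR-level sketch of such a tree (not one of the checked-in tests) would be:

define <4 x i64> @nested_mask_logic(<4 x double> %a, <4 x double> %b, <4 x double> %c, <4 x double> %d) nounwind {
  ; Each compare yields a <4 x i1> mask; after legalization these show up in
  ; the DAG as truncates of v4i64 compare results, i.e. the "truncate from VT"
  ; operands the combine looks for.
  %m0 = fcmp olt <4 x double> %a, %b
  %m1 = fcmp olt <4 x double> %c, %d
  %m2 = fcmp oeq <4 x double> %a, %c
  ; Nested bitwise logic on the narrow masks; the combine can recursively
  ; promote the whole and/or tree to v4i64.
  %and = and <4 x i1> %m0, %m1
  %or = or <4 x i1> %and, %m2
  ; The combine fires on this sign extension and replaces it with a
  ; SIGN_EXTEND_INREG of the widened logic.
  %r = sext <4 x i1> %or to <4 x i64>
  ret <4 x i64> %r
}

The updated CHECK lines in the test diff below show the same effect for the zext and sext cases.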
diff --git a/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll b/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll
index cd98ba7e4083c..59757c27bd020 100644
--- a/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll
+++ b/llvm/test/CodeGen/LoongArch/lasx/vxi1-masks.ll
@@ -31,28 +31,12 @@ define void @xor_zext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
; LA64: # %bb.0:
; LA64-NEXT: xvld $xr0, $a1, 0
; LA64-NEXT: xvld $xr1, $a2, 0
+; LA64-NEXT: pcalau12i $a1, %pc_hi20(.LCPI0_0)
+; LA64-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI0_0)
; LA64-NEXT: xvfcmp.clt.d $xr0, $xr0, $xr1
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 0
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 1
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 1
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 2
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 3
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 3
-; LA64-NEXT: vldi $vr0, -1777
-; LA64-NEXT: vxor.v $vr0, $vr1, $vr0
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 2
-; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 0
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 3
-; LA64-NEXT: vinsgr2vr.d $vr1, $a1, 1
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 0
-; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 0
-; LA64-NEXT: vpickve2gr.w $a1, $vr0, 1
-; LA64-NEXT: vinsgr2vr.d $vr2, $a1, 1
-; LA64-NEXT: xvpermi.q $xr2, $xr1, 2
-; LA64-NEXT: xvrepli.d $xr0, 1
-; LA64-NEXT: xvand.v $xr0, $xr2, $xr0
+; LA64-NEXT: xvxor.v $xr0, $xr0, $xr2
+; LA64-NEXT: xvrepli.d $xr1, 1
+; LA64-NEXT: xvand.v $xr0, $xr0, $xr1
; LA64-NEXT: xvst $xr0, $a0, 0
; LA64-NEXT: ret
%v0 = load <4 x double>, ptr %a
@@ -70,43 +54,10 @@ define void @xor_zext_masks_v8i32(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 0
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 2
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 3
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 4
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 5
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 6
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT: xvpickve2gr.w $a1, $xr0, 7
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT: vldi $vr0, -2305
-; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.w $vr1, $a1, 3
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.w $vr2, $a1, 3
-; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
-; CHECK-NEXT: xvrepli.w $xr0, 1
-; CHECK-NEXT: xvand.v $xr0, $xr2, $xr0
+; CHECK-NEXT: xvldi $xr1, -1789
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvrepli.w $xr1, 1
+; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
%v0 = load <8 x float>, ptr %a
@@ -124,76 +75,10 @@ define void @xor_zext_masks_v16i16(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
; CHECK-NEXT: xvseq.h $xr0, $xr0, $xr1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 3
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 5
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 7
-; CHECK-NEXT: xvpermi.d $xr0, $xr0, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 9
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 11
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 13
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 15
-; CHECK-NEXT: vrepli.h $vr0, 255
-; CHECK-NEXT: vxor.v $vr0, $vr1, $vr0
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 8
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 9
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 1
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 10
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 11
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 3
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 12
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 4
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 13
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 5
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 14
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 6
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 15
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a1, 7
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 0
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 1
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 1
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 2
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 3
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 3
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 4
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 5
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 5
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 6
-; CHECK-NEXT: vpickve2gr.b $a1, $vr0, 7
-; CHECK-NEXT: vinsgr2vr.h $vr2, $a1, 7
-; CHECK-NEXT: xvpermi.q $xr2, $xr1, 2
-; CHECK-NEXT: xvrepli.h $xr0, 1
-; CHECK-NEXT: xvand.v $xr0, $xr2, $xr0
+; CHECK-NEXT: xvrepli.w $xr1, 255
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvrepli.h $xr1, 1
+; CHECK-NEXT: xvand.v $xr0, $xr0, $xr1
; CHECK-NEXT: xvst $xr0, $a0, 0
; CHECK-NEXT: ret
%v0 = load <16 x i16>, ptr %a
@@ -238,22 +123,12 @@ define void @xor_sext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
; LA64: # %bb.0:
; LA64-NEXT: xvld $xr0, $a1, 0
; LA64-NEXT: xvld $xr1, $a2, 0
+; LA64-NEXT: pcalau12i $a1, %pc_hi20(.LCPI3_0)
+; LA64-NEXT: xvld $xr2, $a1, %pc_lo12(.LCPI3_0)
; LA64-NEXT: xvfcmp.clt.d $xr0, $xr0, $xr1
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 0
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 0
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 2
-; LA64-NEXT: vinsgr2vr.w $vr1, $a1, 2
-; LA64-NEXT: xvpickve2gr.d $a1, $xr0, 1
-; LA64-NEXT: xvpickve2gr.d $a2, $xr0, 3
-; LA64-NEXT: vrepli.b $vr0, -1
-; LA64-NEXT: vxor.v $vr0, $vr1, $vr0
-; LA64-NEXT: vpickve2gr.w $a3, $vr0, 2
-; LA64-NEXT: vinsgr2vr.d $vr1, $a3, 0
-; LA64-NEXT: vinsgr2vr.d $vr1, $a2, 1
-; LA64-NEXT: vpickve2gr.w $a2, $vr0, 0
-; LA64-NEXT: vinsgr2vr.d $vr0, $a2, 0
-; LA64-NEXT: vinsgr2vr.d $vr0, $a1, 1
-; LA64-NEXT: xvpermi.q $xr0, $xr1, 2
+; LA64-NEXT: xvxor.v $xr0, $xr0, $xr2
+; LA64-NEXT: xvslli.d $xr0, $xr0, 32
+; LA64-NEXT: xvsrai.d $xr0, $xr0, 32
; LA64-NEXT: xvst $xr0, $a0, 0
; LA64-NEXT: ret
%v0 = load <4 x double>, ptr %a
@@ -266,87 +141,17 @@ define void @xor_sext_masks_v4i64(ptr %res, ptr %a, ptr %b) nounwind {
}
define void @xor_sext_masks_v8i32(ptr %res, ptr %a, ptr %b) nounwind {
-; LA32-LABEL: xor_sext_masks_v8i32:
-; LA32: # %bb.0:
-; LA32-NEXT: xvld $xr0, $a1, 0
-; LA32-NEXT: xvld $xr1, $a2, 0
-; LA32-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 0
-; LA32-NEXT: vinsgr2vr.h $vr1, $a1, 0
-; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 2
-; LA32-NEXT: vinsgr2vr.h $vr1, $a1, 2
-; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 4
-; LA32-NEXT: vinsgr2vr.h $vr1, $a1, 4
-; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 6
-; LA32-NEXT: vinsgr2vr.h $vr1, $a1, 6
-; LA32-NEXT: xvpickve2gr.w $a1, $xr0, 3
-; LA32-NEXT: xvpickve2gr.w $a2, $xr0, 1
-; LA32-NEXT: xvpickve2gr.w $a3, $xr0, 7
-; LA32-NEXT: xvpickve2gr.w $a4, $xr0, 5
-; LA32-NEXT: vrepli.b $vr0, -1
-; LA32-NEXT: vxor.v $vr0, $vr1, $vr0
-; LA32-NEXT: vpickve2gr.h $a5, $vr0, 4
-; LA32-NEXT: ext.w.h $a5, $a5
-; LA32-NEXT: vinsgr2vr.w $vr1, $a5, 0
-; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 1
-; LA32-NEXT: vpickve2gr.h $a4, $vr0, 6
-; LA32-NEXT: ext.w.h $a4, $a4
-; LA32-NEXT: vinsgr2vr.w $vr1, $a4, 2
-; LA32-NEXT: vinsgr2vr.w $vr1, $a3, 3
-; LA32-NEXT: vpickve2gr.h $a3, $vr0, 0
-; LA32-NEXT: ext.w.h $a3, $a3
-; LA32-NEXT: vinsgr2vr.w $vr2, $a3, 0
-; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 1
-; LA32-NEXT: vpickve2gr.h $a2, $vr0, 2
-; LA32-NEXT: ext.w.h $a2, $a2
-; LA32-NEXT: vinsgr2vr.w $vr2, $a2, 2
-; LA32-NEXT: vinsgr2vr.w $vr2, $a1, 3
-; LA32-NEXT: xvpermi.q $xr2, $xr1, 2
-; LA32-NEXT: xvst $xr2, $a0, 0
-; LA32-NEXT: ret
-;
-; LA64-LABEL: xor_sext_masks_v8i32:
-; LA64: # %bb.0:
-; LA64-NEXT: xvld $xr0, $a1, 0
-; LA64-NEXT: xvld $xr1, $a2, 0
-; LA64-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 0
-; LA64-NEXT: vinsgr2vr.h $vr1, $a1, 0
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 2
-; LA64-NEXT: vinsgr2vr.h $vr1, $a1, 2
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 4
-; LA64-NEXT: vinsgr2vr.h $vr1, $a1, 4
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 6
-; LA64-NEXT: vinsgr2vr.h $vr1, $a1, 6
-; LA64-NEXT: xvpickve2gr.w $a1, $xr0, 3
-; LA64-NEXT: xvpickve2gr.w $a2, $xr0, 1
-; LA64-NEXT: xvpickve2gr.w $a3, $xr0, 7
-; LA64-NEXT: xvpickve2gr.w $a4, $xr0, 5
-; LA64-NEXT: vrepli.b $vr0, -1
-; LA64-NEXT: vxor.v $vr0, $vr1, $vr0
-; LA64-NEXT: vpickve2gr.h $a5, $vr0, 4
-; LA64-NEXT: ext.w.h $a5, $a5
-; LA64-NEXT: vinsgr2vr.w $vr1, $a5, 0
-; LA64-NEXT: ext.w.h $a4, $a4
-; LA64-NEXT: vinsgr2vr.w $vr1, $a4, 1
-; LA64-NEXT: vpickve2gr.h $a4, $vr0, 6
-; LA64-NEXT: ext.w.h $a4, $a4
-; LA64-NEXT: vinsgr2vr.w $vr1, $a4, 2
-; LA64-NEXT: ext.w.h $a3, $a3
-; LA64-NEXT: vinsgr2vr.w $vr1, $a3, 3
-; LA64-NEXT: vpickve2gr.h $a3, $vr0, 0
-; LA64-NEXT: ext.w.h $a3, $a3
-; LA64-NEXT: vinsgr2vr.w $vr2, $a3, 0
-; LA64-NEXT: ext.w.h $a2, $a2
-; LA64-NEXT: vinsgr2vr.w $vr2, $a2, 1
-; LA64-NEXT: vpickve2gr.h $a2, $vr0, 2
-; LA64-NEXT: ext.w.h $a2, $a2
-; LA64-NEXT: vinsgr2vr.w $vr2, $a2, 2
-; LA64-NEXT: ext.w.h $a1, $a1
-; LA64-NEXT: vinsgr2vr.w $vr2, $a1, 3
-; LA64-NEXT: xvpermi.q $xr2, $xr1, 2
-; LA64-NEXT: xvst $xr2, $a0, 0
-; LA64-NEXT: ret
+; CHECK-LABEL: xor_sext_masks_v8i32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: xvld $xr0, $a1, 0
+; CHECK-NEXT: xvld $xr1, $a2, 0
+; CHECK-NEXT: xvfcmp.clt.s $xr0, $xr0, $xr1
+; CHECK-NEXT: xvldi $xr1, -1789
+; CHECK-NEXT: xvxor.v $xr0, $xr0, $xr1
+; CHECK-NEXT: xvslli.w $xr0, $xr0, 16
+; CHECK-NEXT: xvsrai.w $xr0, $xr0, 16
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
%v0 = load <8 x float>, ptr %a
%v1 = load <8 x float>, ptr %b
%m0 = fcmp olt <8 x float> %v0, %v1
@@ -362,74 +167,11 @@ define void @xor_sext_masks_v16i16(ptr %res, ptr %a, ptr %b) nounwind {
; CHECK-NEXT: xvld $xr0, $a1, 0
; CHECK-NEXT: xvld $xr1, $a2, 0
; CHECK-NEXT: xvseq.h $xr0, $xr0, $xr1
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 0
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 0
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 2
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 2
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 4
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 4
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 6
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 6
-; CHECK-NEXT: xvpermi.d $xr2, $xr0, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr2, 0
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 8
-; CHECK-NEXT: vpickve2gr.h $a1, $vr2, 2
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 10
-; CHECK-NEXT: vpickve2gr.h $a1, $vr2, 4
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 12
-; CHECK-NEXT: vpickve2gr.h $a1, $vr2, 6
-; CHECK-NEXT: vinsgr2vr.b $vr1, $a1, 14
-; CHECK-NEXT: vpickve2gr.h $a1, $vr0, 7
-; CHECK-NEXT: vpickve2gr.h $a2, $vr0, 5
-; CHECK-NEXT: vpickve2gr.h $a3, $vr0, 3
-; CHECK-NEXT: vpickve2gr.h $a4, $vr0, 1
-; CHECK-NEXT: vpickve2gr.h $a5, $vr2, 7
-; CHECK-NEXT: vpickve2gr.h $a6, $vr2, 5
-; CHECK-NEXT: vpickve2gr.h $a7, $vr2, 3
-; CHECK-NEXT: vpickve2gr.h $t0, $vr2, 1
-; CHECK-NEXT: vxori.b $vr0, $vr1, 255
-; CHECK-NEXT: vpickve2gr.b $t1, $vr0, 8
-; CHECK-NEXT: ext.w.b $t1, $t1
-; CHECK-NEXT: vinsgr2vr.h $vr1, $t1, 0
-; CHECK-NEXT: ext.w.b $t0, $t0
-; CHECK-NEXT: vinsgr2vr.h $vr1, $t0, 1
-; CHECK-NEXT: vpickve2gr.b $t0, $vr0, 10
-; CHECK-NEXT: ext.w.b $t0, $t0
-; CHECK-NEXT: vinsgr2vr.h $vr1, $t0, 2
-; CHECK-NEXT: ext.w.b $a7, $a7
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a7, 3
-; CHECK-NEXT: vpickve2gr.b $a7, $vr0, 12
-; CHECK-NEXT: ext.w.b $a7, $a7
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a7, 4
-; CHECK-NEXT: ext.w.b $a6, $a6
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a6, 5
-; CHECK-NEXT: vpickve2gr.b $a6, $vr0, 14
-; CHECK-NEXT: ext.w.b $a6, $a6
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a6, 6
-; CHECK-NEXT: ext.w.b $a5, $a5
-; CHECK-NEXT: vinsgr2vr.h $vr1, $a5, 7
-; CHECK-NEXT: vpickve2gr.b $a5, $vr0, 0
-; CHECK-NEXT: ext.w.b ...
[truncated]