diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d379a28ea5523..a1c8fade7648e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14815,9 +14815,10 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
     return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
   }
-  if (isTRNMask(ShuffleMask, NumElts, WhichResult)) {
+  if (isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
-    return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
+    return DAG.getNode(Opc, DL, V1.getValueType(), OperandOrder == 0 ? V1 : V2,
+                       OperandOrder == 0 ? V2 : V1);
   }
 
   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult)) {
@@ -16529,7 +16530,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
          isREVMask(M, EltSize, NumElts, 16) ||
          isEXTMask(M, VT, DummyBool, DummyUnsigned) ||
          isSingletonEXTMask(M, VT, DummyUnsigned) ||
-         isTRNMask(M, NumElts, DummyUnsigned) ||
+         isTRNMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
          isUZPMask(M, NumElts, DummyUnsigned) ||
          isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
@@ -31588,10 +31589,13 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
                                    OperandOrder == 0 ? Op1 : Op2,
                                    OperandOrder == 0 ? Op2 : Op1));
 
-  if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
+  if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
+                OperandOrder)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
-    return convertFromScalableVector(
-        DAG, VT, DAG.getNode(Opc, DL, ContainerVT, Op1, Op2));
+    SDValue TRN =
+        DAG.getNode(Opc, DL, ContainerVT, OperandOrder == 0 ? Op1 : Op2,
+                    OperandOrder == 0 ? Op2 : Op1);
+    return convertFromScalableVector(DAG, VT, TRN);
   }
 
   if (isZIP_v_undef_Mask(ShuffleMask, VT, WhichResult) && WhichResult == 0)
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index ef8786d0ad0e1..c7d6b31291197 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6699,33 +6699,53 @@ inline bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
 }
 
 /// Return true for trn1 or trn2 masks of the form:
-/// <0, 8, 2, 10, 4, 12, 6, 14> or
-/// <1, 9, 3, 11, 5, 13, 7, 15>
+/// <0, 8, 2, 10, 4, 12, 6, 14> (WhichResultOut = 0, OperandOrderOut = 0) or
+/// <1, 9, 3, 11, 5, 13, 7, 15> (WhichResultOut = 1, OperandOrderOut = 0) or
+/// <8, 0, 10, 2, 12, 4, 14, 6> (WhichResultOut = 0, OperandOrderOut = 1) or
+/// <9, 1, 11, 3, 13, 5, 15, 7> (WhichResultOut = 1, OperandOrderOut = 1)
 inline bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
-                      unsigned &WhichResultOut) {
+                      unsigned &WhichResultOut, unsigned &OperandOrderOut) {
   if (NumElts % 2 != 0)
     return false;
-  // Check the first non-undef element for trn1 vs trn2.
-  unsigned WhichResult = 2;
+
+  // "Result" corresponds to "WhichResultOut", selecting between trn1 and
+  // trn2. "Order" corresponds to "OperandOrderOut", selecting the order of
+  // operands for the instruction (flipped or not).
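+  // For example, with NumElts = 8 the mask <8, 0, 10, 2, 12, 4, 14, 6> has
+  // M[0] == NumElts + 0, which falsifies every pattern except Result0Order1,
+  // and all remaining defined lanes stay consistent with that pattern.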
+  bool Result0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0
+  bool Result1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0
+  bool Result0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1
+  bool Result1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1
+  // Check all elements match.
   for (unsigned i = 0; i != NumElts; i += 2) {
     if (M[i] >= 0) {
-      WhichResult = ((unsigned)M[i] == i ? 0 : 1);
-      break;
+      unsigned EvenElt = (unsigned)M[i];
+      if (EvenElt != i)
+        Result0Order0 = false;
+      if (EvenElt != i + 1)
+        Result1Order0 = false;
+      if (EvenElt != NumElts + i)
+        Result0Order1 = false;
+      if (EvenElt != NumElts + i + 1)
+        Result1Order1 = false;
     }
     if (M[i + 1] >= 0) {
-      WhichResult = ((unsigned)M[i + 1] == i + NumElts ? 0 : 1);
-      break;
+      unsigned OddElt = (unsigned)M[i + 1];
+      if (OddElt != NumElts + i)
+        Result0Order0 = false;
+      if (OddElt != NumElts + i + 1)
+        Result1Order0 = false;
+      if (OddElt != i)
+        Result0Order1 = false;
+      if (OddElt != i + 1)
+        Result1Order1 = false;
     }
   }
-  if (WhichResult == 2)
+
+  if (Result0Order0 + Result1Order0 + Result0Order1 + Result1Order1 != 1)
     return false;
-  for (unsigned i = 0; i < NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned)M[i] != i + WhichResult) ||
-        (M[i + 1] >= 0 && (unsigned)M[i + 1] != i + NumElts + WhichResult))
-      return false;
-  }
-  WhichResultOut = WhichResult;
+  WhichResultOut = (Result0Order0 || Result0Order1) ? 0 : 1;
+  OperandOrderOut = (Result0Order0 || Result1Order0) ? 0 : 1;
   return true;
 }
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 4fba593b3d0fb..221a7bcd881bb 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -215,14 +215,15 @@ bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI,
               ShuffleVectorPseudo &MatchInfo) {
   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
   unsigned WhichResult;
+  unsigned OperandOrder;
   ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
   Register Dst = MI.getOperand(0).getReg();
   unsigned NumElts = MRI.getType(Dst).getNumElements();
-  if (!isTRNMask(ShuffleMask, NumElts, WhichResult))
+  if (!isTRNMask(ShuffleMask, NumElts, WhichResult, OperandOrder))
     return false;
   unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2;
-  Register V1 = MI.getOperand(1).getReg();
-  Register V2 = MI.getOperand(2).getReg();
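+  // OperandOrder == 1 means the mask matched the source vectors in swapped
+  // order, so feed the registers to G_TRN1/G_TRN2 flipped.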
+  Register V1 = MI.getOperand(OperandOrder == 0 ? 1 : 2).getReg();
+  Register V2 = MI.getOperand(OperandOrder == 0 ? 2 : 1).getReg();
   MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
   return true;
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-trn.ll b/llvm/test/CodeGen/AArch64/arm64-trn.ll
index fe245d01a7a6d..120c2d13a7ab7 100644
--- a/llvm/test/CodeGen/AArch64/arm64-trn.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-trn.ll
@@ -246,6 +246,63 @@ define <4 x float> @vtrnQf(ptr %A, ptr %B) nounwind {
   ret <4 x float> %tmp5
 }
 
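+; The masks below interleave lanes of %B and %A exactly as trn1/trn2 would
+; with the operands swapped, so codegen flips the register operands instead
+; of emitting extra moves.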
+define <8 x i8> @vtrni8_trn1_flipped(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_trn1_flipped:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    trn1 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: vtrni8_trn1_flipped:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v1.8b, v1.8b
+; CHECKBE-NEXT:    trn1 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 8, i32 0, i32 10, i32 2, i32 12, i32 4, i32 14, i32 6>
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @vtrni8_trn2_flipped(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_trn2_flipped:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: vtrni8_trn2_flipped:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v1.8b, v1.8b
+; CHECKBE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 9, i32 1, i32 11, i32 3, i32 13, i32 5, i32 15, i32 7>
+  ret <8 x i8> %tmp1
+}
+
+define <8 x i8> @vtrni8_both_flipped_with_poison_values(<8 x i8> %A, <8 x i8> %B) nounwind {
+; CHECKLE-LABEL: vtrni8_both_flipped_with_poison_values:
+; CHECKLE:       // %bb.0:
+; CHECKLE-NEXT:    trn1 v2.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKLE-NEXT:    add v0.8b, v2.8b, v0.8b
+; CHECKLE-NEXT:    ret
+;
+; CHECKBE-LABEL: vtrni8_both_flipped_with_poison_values:
+; CHECKBE:       // %bb.0:
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v1.8b, v1.8b
+; CHECKBE-NEXT:    trn1 v2.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    trn2 v0.8b, v1.8b, v0.8b
+; CHECKBE-NEXT:    add v0.8b, v2.8b, v0.8b
+; CHECKBE-NEXT:    rev64 v0.8b, v0.8b
+; CHECKBE-NEXT:    ret
+  %tmp1 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 8, i32 0, i32 poison, i32 2, i32 12, i32 4, i32 14, i32 6>
+  %tmp2 = shufflevector <8 x i8> %A, <8 x i8> %B, <8 x i32> <i32 9, i32 poison, i32 11, i32 3, i32 13, i32 5, i32 poison, i32 7>
+  %tmp3 = add <8 x i8> %tmp1, %tmp2
+  ret <8 x i8> %tmp3
+}
+
 ; Undef shuffle indices (even at the start of the shuffle mask) should not prevent matching to VTRN:
 define <8 x i8> @vtrni8_undef(ptr %A, ptr %B) nounwind {
diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
index 282e0503dd7be..8e75d69be5062 100644
--- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
+++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll
@@ -6,12 +6,10 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec
 ; CHECK-SD-LABEL: vector_deinterleave_v2f16_v4f16:
 ; CHECK-SD:       // %bb.0:
 ; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    dup v2.2s, v0.s[1]
-; CHECK-SD-NEXT:    mov v1.16b, v2.16b
-; CHECK-SD-NEXT:    zip1 v2.4h, v0.4h, v2.4h
-; CHECK-SD-NEXT:    mov v1.h[0], v0.h[1]
+; CHECK-SD-NEXT:    dup v1.2s, v0.s[1]
+; CHECK-SD-NEXT:    zip1 v2.4h, v0.4h, v1.4h
+; CHECK-SD-NEXT:    trn2 v1.4h, v0.4h, v1.4h
 ; CHECK-SD-NEXT:    fmov d0, d2
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 killed $q1
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: vector_deinterleave_v2f16_v4f16:
diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
index 072f6f4e8f73e..39beffcf85783 100644
--- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
+++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll
@@ -36,93 +36,93 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    zip1 v5.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    trn1 v6.4s, v3.4s, v0.4s
 ; CHECK-NEXT:    zip2 v0.4s, v3.4s, v0.4s
-; CHECK-NEXT:    ext v16.16b, v1.16b, v1.16b, #12
-; CHECK-NEXT:    zip2 v17.4s, v1.4s, v2.4s
-; CHECK-NEXT:    zip2 v7.4s, v2.4s, v1.4s
-; CHECK-NEXT:    zip1 v18.4s, v2.4s, v1.4s
+; CHECK-NEXT:    ext v7.16b, v1.16b, v1.16b, #12
+; CHECK-NEXT:    zip2 v16.4s, v1.4s, v2.4s
+; CHECK-NEXT:    zip1 v17.4s, v2.4s, v1.4s
+; CHECK-NEXT:    trn2 v18.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    uzp2 v4.4s, v4.4s, v1.4s
 ; CHECK-NEXT:    ext v3.16b, v3.16b, v5.16b, #8
-; CHECK-NEXT:    mov v1.s[0], v2.s[1]
-; CHECK-NEXT:    ext v2.16b, v2.16b, v16.16b, #12
-; CHECK-NEXT:    mov v17.d[1], v6.d[1]
-; CHECK-NEXT:    mov v7.d[1], v6.d[1]
+; CHECK-NEXT:    zip2 v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    ext v2.16b, v2.16b, v7.16b, #12
+; CHECK-NEXT:    mov v16.d[1], v6.d[1]
+; CHECK-NEXT:    mov v18.d[1], v5.d[1]
 ; CHECK-NEXT:    mov v4.d[1], v0.d[1]
-; CHECK-NEXT:    mov v18.d[1], v3.d[1]
-; CHECK-NEXT:    mov v1.d[1], v5.d[1]
+; CHECK-NEXT:    mov v17.d[1], v3.d[1]
+; CHECK-NEXT:    mov v1.d[1], v6.d[1]
 ; CHECK-NEXT:    mov v2.d[1], v0.d[1]
-; CHECK-NEXT:    add v0.4s, v4.4s, v17.4s
-; CHECK-NEXT:    add v3.4s, v1.4s, v18.4s
-; CHECK-NEXT:    sub v1.4s, v18.4s, v1.4s
-; CHECK-NEXT:    sub v2.4s, v7.4s, v2.4s
+; CHECK-NEXT:    add v0.4s, v4.4s, v16.4s
+; CHECK-NEXT:    add v3.4s, v18.4s, v17.4s
+; CHECK-NEXT:    sub v6.4s, v17.4s, v18.4s
+; CHECK-NEXT:    sub v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    rev64 v4.4s, v0.4s
 ; CHECK-NEXT:    rev64 v5.4s, v3.4s
-; CHECK-NEXT:    sub v6.4s, v1.4s, v2.4s
-; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
+; CHECK-NEXT:    sub v2.4s, v6.4s, v1.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v6.4s
 ; CHECK-NEXT:    mov v4.d[1], v0.d[1]
 ; CHECK-NEXT:    mov v5.d[1], v3.d[1]
-; CHECK-NEXT:    rev64 v2.4s, v6.4s
+; CHECK-NEXT:    rev64 v6.4s, v2.4s
 ; CHECK-NEXT:    rev64 v7.4s, v1.4s
 ; CHECK-NEXT:    sub v3.4s, v3.4s, v4.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v5.4s
-; CHECK-NEXT:    sub v2.4s, v6.4s, v2.4s
+; CHECK-NEXT:    sub v4.4s, v2.4s, v6.4s
 ; CHECK-NEXT:    sub v5.4s, v1.4s, v7.4s
-; CHECK-NEXT:    addp v4.4s, v3.4s, v6.4s
+; CHECK-NEXT:    addp v2.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    addp v1.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    rev64 v6.4s, v0.4s
 ; CHECK-NEXT:    rev64 v7.4s, v3.4s
-; CHECK-NEXT:    ext v16.16b, v4.16b, v2.16b, #4
+; CHECK-NEXT:    ext v16.16b, v2.16b, v4.16b, #4
 ; CHECK-NEXT:    ext v17.16b, v1.16b, v5.16b, #4
 ; CHECK-NEXT:    sub v0.4s, v0.4s, v6.4s
 ; CHECK-NEXT:    sub v3.4s, v3.4s, v7.4s
-; CHECK-NEXT:    mov v7.16b, v2.16b
-; CHECK-NEXT:    zip2 v6.4s, v16.4s, v4.4s
+; CHECK-NEXT:    mov v7.16b, v4.16b
+; CHECK-NEXT:    zip2 v6.4s, v16.4s, v2.4s
 ; CHECK-NEXT:    mov v16.16b, v5.16b
 ; CHECK-NEXT:    zip2 v17.4s, v17.4s, v1.4s
 ; CHECK-NEXT:    ext v18.16b, v0.16b, v1.16b, #4
-; CHECK-NEXT:    mov v7.s[2], v4.s[3]
+; CHECK-NEXT:    mov v7.s[2], v2.s[3]
 ; CHECK-NEXT:    mov v21.16b, v3.16b
 ; CHECK-NEXT:    mov v16.s[2], v1.s[3]
 ; CHECK-NEXT:    ext v5.16b, v5.16b, v17.16b, #12
 ; CHECK-NEXT:    zip1 v17.4s, v1.4s, v1.4s
-; CHECK-NEXT:    ext v2.16b, v2.16b, v6.16b, #12
+; CHECK-NEXT:    ext v4.16b, v4.16b, v6.16b, #12
 ; CHECK-NEXT:    ext v18.16b, v18.16b, v18.16b, #4
 ; CHECK-NEXT:    mov v19.16b, v7.16b
-; CHECK-NEXT:    ext v6.16b, v3.16b, v4.16b, #8
-; CHECK-NEXT:    mov v21.s[2], v4.s[1]
+; CHECK-NEXT:    ext v6.16b, v3.16b, v2.16b, #8
+; CHECK-NEXT:    mov v21.s[2], v2.s[1]
 ; CHECK-NEXT:    mov v20.16b, v16.16b
-; CHECK-NEXT:    mov v19.s[1], v4.s[2]
+; CHECK-NEXT:    mov v19.s[1], v2.s[2]
 ; CHECK-NEXT:    trn2 v0.4s, v17.4s, v0.4s
 ; CHECK-NEXT:    sub v16.4s, v16.4s, v5.4s
 ; CHECK-NEXT:    mov v17.16b, v18.16b
 ; CHECK-NEXT:    ext v3.16b, v6.16b, v3.16b, #4
-; CHECK-NEXT:    sub v7.4s, v7.4s, v2.4s
+; CHECK-NEXT:    sub v7.4s, v7.4s, v4.4s
 ; CHECK-NEXT:    mov v20.s[1], v1.s[2]
 ; CHECK-NEXT:    mov v17.s[0], v1.s[1]
 ; CHECK-NEXT:    mov v1.16b, v21.16b
-; CHECK-NEXT:    add v2.4s, v19.4s, v2.4s
-; CHECK-NEXT:    uzp2 v3.4s, v6.4s, v3.4s
+; CHECK-NEXT:    add v4.4s, v19.4s, v4.4s
 ; CHECK-NEXT:    add v5.4s, v20.4s, v5.4s
-; CHECK-NEXT:    mov v1.s[1], v4.s[0]
-; CHECK-NEXT:    sub v4.4s, v0.4s, v18.4s
-; CHECK-NEXT:    mov v2.d[1], v7.d[1]
+; CHECK-NEXT:    mov v1.s[1], v2.s[0]
+; CHECK-NEXT:    uzp2 v2.4s, v6.4s, v3.4s
+; CHECK-NEXT:    sub v3.4s, v0.4s, v18.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v17.4s
+; CHECK-NEXT:    mov v4.d[1], v7.d[1]
 ; CHECK-NEXT:    mov v5.d[1], v16.d[1]
-; CHECK-NEXT:    sub v6.4s, v21.4s, v3.4s
-; CHECK-NEXT:    add v1.4s, v1.4s, v3.4s
-; CHECK-NEXT:    mov v0.d[1], v4.d[1]
-; CHECK-NEXT:    cmlt v4.8h, v2.8h, #0
-; CHECK-NEXT:    cmlt v3.8h, v5.8h, #0
+; CHECK-NEXT:    sub v6.4s, v21.4s, v2.4s
+; CHECK-NEXT:    add v1.4s, v1.4s, v2.4s
+; CHECK-NEXT:    mov v0.d[1], v3.d[1]
+; CHECK-NEXT:    cmlt v3.8h, v4.8h, #0
+; CHECK-NEXT:    cmlt v2.8h, v5.8h, #0
 ; CHECK-NEXT:    mov v1.d[1], v6.d[1]
-; CHECK-NEXT:    add v2.4s, v4.4s, v2.4s
 ; CHECK-NEXT:    cmlt v6.8h, v0.8h, #0
-; CHECK-NEXT:    add v5.4s, v3.4s, v5.4s
-; CHECK-NEXT:    eor v2.16b, v2.16b, v4.16b
+; CHECK-NEXT:    add v4.4s, v3.4s, v4.4s
+; CHECK-NEXT:    add v5.4s, v2.4s, v5.4s
 ; CHECK-NEXT:    cmlt v7.8h, v1.8h, #0
 ; CHECK-NEXT:    add v0.4s, v6.4s, v0.4s
-; CHECK-NEXT:    eor v3.16b, v5.16b, v3.16b
+; CHECK-NEXT:    eor v3.16b, v4.16b, v3.16b
+; CHECK-NEXT:    eor v2.16b, v5.16b, v2.16b
 ; CHECK-NEXT:    add v1.4s, v7.4s, v1.4s
 ; CHECK-NEXT:    eor v0.16b, v0.16b, v6.16b
-; CHECK-NEXT:    add v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    add v2.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    eor v1.16b, v1.16b, v7.16b
 ; CHECK-NEXT:    add v0.4s, v0.4s, v2.4s
 ; CHECK-NEXT:    add v0.4s, v0.4s, v1.4s
@@ -255,77 +255,76 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    saddw v3.4s, v6.4s, v3.4h
 ; CHECK-NEXT:    saddw v2.4s, v7.4s, v2.4h
 ; CHECK-NEXT:    zip1 v4.4s, v1.4s, v0.4s
-; CHECK-NEXT:    trn1 v18.4s, v1.4s, v0.4s
+; CHECK-NEXT:    trn1 v6.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    zip2 v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    uzp2 v5.4s, v3.4s, v2.4s
-; CHECK-NEXT:    mov v7.16b, v3.16b
-; CHECK-NEXT:    zip1 v6.4s, v2.4s, v3.4s
-; CHECK-NEXT:    zip2 v16.4s, v3.4s, v2.4s
+; CHECK-NEXT:    zip1 v7.4s, v2.4s, v3.4s
+; CHECK-NEXT:    trn2 v16.4s, v2.4s, v3.4s
+; CHECK-NEXT:    ext v18.16b, v3.16b, v3.16b, #12
 ; CHECK-NEXT:    ext v17.16b, v1.16b, v4.16b, #8
-; CHECK-NEXT:    mov v7.s[0], v2.s[1]
-; CHECK-NEXT:    ext v1.16b, v3.16b, v3.16b, #12
+; CHECK-NEXT:    zip2 v1.4s, v3.4s, v2.4s
 ; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v3.4s
+; CHECK-NEXT:    mov v16.d[1], v4.d[1]
 ; CHECK-NEXT:    zip2 v3.4s, v2.4s, v3.4s
-; CHECK-NEXT:    mov v16.d[1], v18.d[1]
-; CHECK-NEXT:    mov v6.d[1], v17.d[1]
-; CHECK-NEXT:    mov v7.d[1], v4.d[1]
-; CHECK-NEXT:    ext v1.16b, v2.16b, v1.16b, #12
+; CHECK-NEXT:    ext v2.16b, v2.16b, v18.16b, #12
+; CHECK-NEXT:    mov v7.d[1], v17.d[1]
+; CHECK-NEXT:    mov v1.d[1], v6.d[1]
 ; CHECK-NEXT:    mov v5.d[1], v0.d[1]
-; CHECK-NEXT:    mov v3.d[1], v18.d[1]
-; CHECK-NEXT:    add v2.4s, v7.4s, v6.4s
-; CHECK-NEXT:    mov v1.d[1], v0.d[1]
-; CHECK-NEXT:    add v4.4s, v5.4s, v16.4s
-; CHECK-NEXT:    rev64 v5.4s, v2.4s
-; CHECK-NEXT:    rev64 v0.4s, v4.4s
-; CHECK-NEXT:    sub v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    sub v3.4s, v6.4s, v7.4s
-; CHECK-NEXT:    mov v5.d[1], v2.d[1]
-; CHECK-NEXT:    add v6.4s, v1.4s, v3.4s
-; CHECK-NEXT:    sub v1.4s, v3.4s, v1.4s
-; CHECK-NEXT:    mov v0.d[1], v4.d[1]
-; CHECK-NEXT:    add v4.4s, v4.4s, v5.4s
-; CHECK-NEXT:    sub v0.4s, v2.4s, v0.4s
-; CHECK-NEXT:    zip1 v2.4s, v4.4s, v6.4s
-; CHECK-NEXT:    uzp2 v3.4s, v4.4s, v6.4s
-; CHECK-NEXT:    zip2 v16.4s, v4.4s, v6.4s
-; CHECK-NEXT:    zip1 v5.4s, v0.4s, v1.4s
-; CHECK-NEXT:    trn1 v7.4s, v0.4s, v1.4s
-; CHECK-NEXT:    zip2 v1.4s, v0.4s, v1.4s
-; CHECK-NEXT:    trn2 v2.4s, v4.4s, v2.4s
-; CHECK-NEXT:    uzp2 v3.4s, v3.4s, v4.4s
-; CHECK-NEXT:    mov v4.s[1], v6.s[1]
-; CHECK-NEXT:    ext v0.16b, v0.16b, v5.16b, #8
-; CHECK-NEXT:    mov v16.d[1], v7.d[1]
-; CHECK-NEXT:    mov v3.d[1], v1.d[1]
-; CHECK-NEXT:    mov v4.d[1], v5.d[1]
+; CHECK-NEXT:    mov v2.d[1], v0.d[1]
-; CHECK-NEXT:    add v0.4s, v16.4s, v3.4s
-; CHECK-NEXT:    sub v3.4s, v3.4s, v16.4s
-; CHECK-NEXT:    add v1.4s, v4.4s, v2.4s
-; CHECK-NEXT:    sub v2.4s, v2.4s, v4.4s
+; CHECK-NEXT:    add v4.4s, v16.4s, v7.4s
+; CHECK-NEXT:    sub v6.4s, v7.4s, v16.4s
+; CHECK-NEXT:    add v1.4s, v5.4s, v1.4s
+; CHECK-NEXT:    sub v2.4s, v3.4s, v2.4s
+; CHECK-NEXT:    rev64 v5.4s, v4.4s
+; CHECK-NEXT:    rev64 v0.4s, v1.4s
+; CHECK-NEXT:    add v3.4s, v2.4s, v6.4s
+; CHECK-NEXT:    sub v2.4s, v6.4s, v2.4s
+; CHECK-NEXT:    mov v5.d[1], v4.d[1]
+; CHECK-NEXT:    mov v0.d[1], v1.d[1]
+; CHECK-NEXT:    add v1.4s, v1.4s, v5.4s
+; CHECK-NEXT:    sub v0.4s, v4.4s, v0.4s
+; CHECK-NEXT:    zip1 v4.4s, v1.4s, v3.4s
+; CHECK-NEXT:    uzp2 v5.4s, v1.4s, v3.4s
+; CHECK-NEXT:    zip2 v7.4s, v1.4s, v3.4s
+; CHECK-NEXT:    zip1 v6.4s, v0.4s, v2.4s
+; CHECK-NEXT:    trn1 v16.4s, v0.4s, v2.4s
+; CHECK-NEXT:    zip2 v2.4s, v0.4s, v2.4s
+; CHECK-NEXT:    trn2 v4.4s, v1.4s, v4.4s
+; CHECK-NEXT:    uzp2 v5.4s, v5.4s, v1.4s
+; CHECK-NEXT:    mov v1.s[1], v3.s[1]
+; CHECK-NEXT:    ext v0.16b, v0.16b, v6.16b, #8
+; CHECK-NEXT:    mov v7.d[1], v16.d[1]
+; CHECK-NEXT:    mov v5.d[1], v2.d[1]
+; CHECK-NEXT:    mov v1.d[1], v6.d[1]
+; CHECK-NEXT:    mov v4.d[1], v0.d[1]
+; CHECK-NEXT:    add v0.4s, v7.4s, v5.4s
+; CHECK-NEXT:    sub v3.4s, v5.4s, v7.4s
+; CHECK-NEXT:    add v2.4s, v1.4s, v4.4s
+; CHECK-NEXT:    sub v1.4s, v4.4s, v1.4s
 ; CHECK-NEXT:    ext v4.16b, v0.16b, v0.16b, #4
 ; CHECK-NEXT:    zip2 v6.4s, v0.4s, v3.4s
 ; CHECK-NEXT:    zip2 v7.4s, v3.4s, v0.4s
-; CHECK-NEXT:    ext v5.16b, v1.16b, v1.16b, #4
-; CHECK-NEXT:    zip2 v16.4s, v2.4s, v1.4s
-; CHECK-NEXT:    zip2 v17.4s, v1.4s, v2.4s
+; CHECK-NEXT:    ext v5.16b, v2.16b, v2.16b, #4
+; CHECK-NEXT:    zip2 v16.4s, v1.4s, v2.4s
+; CHECK-NEXT:    zip2 v17.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    zip1 v0.4s, v0.4s, v3.4s
-; CHECK-NEXT:    zip1 v1.4s, v1.4s, v2.4s
 ; CHECK-NEXT:    ext v18.16b, v4.16b, v3.16b, #8
-; CHECK-NEXT:    ext v19.16b, v5.16b, v2.16b, #8
+; CHECK-NEXT:    ext v19.16b, v5.16b, v1.16b, #8
+; CHECK-NEXT:    zip1 v1.4s, v2.4s, v1.4s
 ; CHECK-NEXT:    add v2.4s, v16.4s, v7.4s
 ; CHECK-NEXT:    sub v3.4s, v6.4s, v17.4s
-; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
 ; CHECK-NEXT:    ext v4.16b, v18.16b, v4.16b, #4
-; CHECK-NEXT:    cmlt v1.8h, v3.8h, #0
 ; CHECK-NEXT:    cmlt v6.8h, v2.8h, #0
 ; CHECK-NEXT:    ext v5.16b, v19.16b, v5.16b, #4
+; CHECK-NEXT:    sub v0.4s, v0.4s, v1.4s
+; CHECK-NEXT:    cmlt v1.8h, v3.8h, #0
 ; CHECK-NEXT:    add v2.4s, v6.4s, v2.4s
 ; CHECK-NEXT:    add v3.4s, v1.4s, v3.4s
 ; CHECK-NEXT:    add v4.4s, v5.4s, v4.4s
 ; CHECK-NEXT:    cmlt v5.8h, v0.8h, #0
-; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
 ; CHECK-NEXT:    eor v2.16b, v2.16b, v6.16b
+; CHECK-NEXT:    eor v1.16b, v3.16b, v1.16b
 ; CHECK-NEXT:    cmlt v7.8h, v4.8h, #0
 ; CHECK-NEXT:    add v0.4s, v5.4s, v0.4s
 ; CHECK-NEXT:    add v1.4s, v2.4s, v1.4s
@@ -480,7 +479,7 @@ define i32 @v3(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur
 ; CHECK-NEXT:    sub v3.4s, v3.4s, v7.4s
 ; CHECK-NEXT:    uzp2 v4.4s, v1.4s, v0.4s
 ; CHECK-NEXT:    uzp1 v7.4s, v1.4s, v0.4s
-; CHECK-NEXT:    mov v6.s[3], v5.s[2]
+; CHECK-NEXT:    trn1 v6.4s, v6.4s, v5.4s
 ; CHECK-NEXT:    ext v0.16b, v0.16b, v0.16b, #8
 ; CHECK-NEXT:    zip2 v17.4s, v2.4s, v3.4s
 ; CHECK-NEXT:    zip1 v2.4s, v2.4s, v3.4s