From 09806bfcb259fe4d61813517aa27414c84196b6e Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen
Date: Sun, 9 Nov 2025 14:58:52 +0000
Subject: [PATCH 1/4] [AArch64] recognise zip1/zip2 with flipped operands

Shuffle masks of the form <8, 0, 9, 1, ...> are zips with the two input
registers swapped. Let isZIPMask report the operand order as well, and
swap the inputs when building the ZIP1/ZIP2 node, instead of falling
back to a tbl lookup through a constant pool.
---
 .../Target/AArch64/AArch64ISelLowering.cpp    |  26 ++-
 .../Target/AArch64/AArch64PerfectShuffle.h    |  57 +++---
 .../AArch64/AArch64TargetTransformInfo.cpp    |   2 +-
 .../GISel/AArch64PostLegalizerLowering.cpp    |   3 +-
 llvm/test/CodeGen/AArch64/arm64-zip.ll        |  34 ++--
 .../AArch64/fixed-vector-deinterleave.ll      |   4 +-
 llvm/test/CodeGen/AArch64/insert-extend.ll    |  72 ++++----
 llvm/test/CodeGen/AArch64/insert-subvector.ll |  37 ++--
 .../CodeGen/AArch64/neon-widen-shuffle.ll     |   5 +-
 llvm/test/CodeGen/AArch64/reduce-shuffle.ll   | 166 +++++++++---------
 10 files changed, 203 insertions(+), 203 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 8f41f230b5521..df92d307fb033 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14722,9 +14722,12 @@ SDValue AArch64TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
   }
 
   unsigned WhichResult;
-  if (isZIPMask(ShuffleMask, NumElts, WhichResult)) {
+  unsigned OperandOrder;
+  if (isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::ZIP1 : AArch64ISD::ZIP2;
-    return DAG.getNode(Opc, DL, V1.getValueType(), V1, V2);
+    return DAG.getNode(Opc, DL, V1.getValueType(),
+                       (OperandOrder == 0) ? V1 : V2,
+                       (OperandOrder == 0) ? V2 : V1);
   }
   if (isUZPMask(ShuffleMask, NumElts, WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
@@ -16446,7 +16449,7 @@ bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
           isSingletonEXTMask(M, VT, DummyUnsigned) ||
           isTRNMask(M, NumElts, DummyUnsigned) ||
           isUZPMask(M, NumElts, DummyUnsigned) ||
-          isZIPMask(M, NumElts, DummyUnsigned) ||
+          isZIPMask(M, NumElts, DummyUnsigned, DummyUnsigned) ||
          isTRN_v_undef_Mask(M, VT, DummyUnsigned) ||
          isUZP_v_undef_Mask(M, VT, DummyUnsigned) ||
          isZIP_v_undef_Mask(M, VT, DummyUnsigned) ||
@@ -31440,10 +31443,15 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
   }
 
   unsigned WhichResult;
-  if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
+  unsigned OperandOrder;
+  if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
+                OperandOrder) &&
       WhichResult == 0)
     return convertFromScalableVector(
-        DAG, VT, DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT, Op1, Op2));
+        DAG, VT,
+        DAG.getNode(AArch64ISD::ZIP1, DL, ContainerVT,
+                    OperandOrder == 0 ? Op1 : Op2,
+                    OperandOrder == 0 ? Op2 : Op1));
 
   if (isTRNMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::TRN1 : AArch64ISD::TRN2;
@@ -31488,10 +31496,14 @@ SDValue AArch64TargetLowering::LowerFixedLengthVECTOR_SHUFFLEToSVE(
     return convertFromScalableVector(DAG, VT, Op);
   }
 
-  if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult) &&
+  if (isZIPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult,
+                OperandOrder) &&
      WhichResult != 0)
    return convertFromScalableVector(
-        DAG, VT, DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT, Op1, Op2));
+        DAG, VT,
+        DAG.getNode(AArch64ISD::ZIP2, DL, ContainerVT,
+                    OperandOrder == 0 ? Op1 : Op2,
+                    OperandOrder == 0 ? Op2 : Op1));
 
   if (isUZPMask(ShuffleMask, VT.getVectorNumElements(), WhichResult)) {
     unsigned Opc = (WhichResult == 0) ? AArch64ISD::UZP1 : AArch64ISD::UZP2;
diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index c28cbf2bc63c2..4b8cc61cc7d33 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6623,34 +6623,49 @@ inline unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) {
 
 /// Return true for zip1 or zip2 masks of the form:
 /// <0, 8, 1, 9, 2, 10, 3, 11> or
-/// <4, 12, 5, 13, 6, 14, 7, 15>
+/// <4, 12, 5, 13, 6, 14, 7, 15> or
+/// <8, 0, 9, 1, 10, 2, 11, 3> or
+/// <12, 4, 13, 5, 14, 6, 15, 7>
 inline bool isZIPMask(ArrayRef<int> M, unsigned NumElts,
-                      unsigned &WhichResultOut) {
+                      unsigned &WhichResultOut, unsigned &OperandOrderOut) {
   if (NumElts % 2 != 0)
     return false;
 
-  // Check the first non-undef element for which half to use.
-  unsigned WhichResult = 2;
-  for (unsigned i = 0; i != NumElts / 2; i++) {
-    if (M[i * 2] >= 0) {
-      WhichResult = ((unsigned)M[i * 2] == i ? 0 : 1);
-      break;
-    } else if (M[i * 2 + 1] >= 0) {
-      WhichResult = ((unsigned)M[i * 2 + 1] == NumElts + i ? 0 : 1);
-      break;
-    }
-  }
-  if (WhichResult == 2)
-    return false;
+  // "Variant" refers to the distinction between zip1 and zip2, while
+  // "Order" refers to the sequence of input registers (matching vs flipped).
+  bool Variant0Order0 = true;
+  bool Variant1Order0 = true;
+  bool Variant0Order1 = true;
+  bool Variant1Order1 = true;
 
   // Check all elements match.
-  unsigned Idx = WhichResult * NumElts / 2;
   for (unsigned i = 0; i != NumElts; i += 2) {
-    if ((M[i] >= 0 && (unsigned)M[i] != Idx) ||
-        (M[i + 1] >= 0 && (unsigned)M[i + 1] != Idx + NumElts))
-      return false;
-    Idx += 1;
+    if (M[i] >= 0) {
+      if ((unsigned)M[i] != i / 2)
+        Variant0Order0 = false;
+      if ((unsigned)M[i] != NumElts / 2 + i / 2)
+        Variant1Order0 = false;
+      if ((unsigned)M[i] != NumElts + i / 2)
+        Variant0Order1 = false;
+      if ((unsigned)M[i] != NumElts + NumElts / 2 + i / 2)
+        Variant1Order1 = false;
+    }
+    if (M[i + 1] >= 0) {
+      if ((unsigned)M[i + 1] != NumElts + i / 2)
+        Variant0Order0 = false;
+      if ((unsigned)M[i + 1] != NumElts + NumElts / 2 + i / 2)
+        Variant1Order0 = false;
+      if ((unsigned)M[i + 1] != i / 2)
+        Variant0Order1 = false;
+      if ((unsigned)M[i + 1] != NumElts / 2 + i / 2)
+        Variant1Order1 = false;
+    }
   }
-  WhichResultOut = WhichResult;
+
+  if (Variant0Order0 + Variant1Order0 + Variant0Order1 + Variant1Order1 != 1)
+    return false;
+
+  WhichResultOut = (Variant0Order0 || Variant0Order1) ? 0 : 1;
+  OperandOrderOut = (Variant0Order0 || Variant1Order0) ? 0 : 1;
   return true;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0bae00bafee3c..c7fb8e3bcb70e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -6062,7 +6062,7 @@ AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy,
   if (LT.second.isFixedLengthVector() &&
       LT.second.getVectorNumElements() == Mask.size() &&
       (Kind == TTI::SK_PermuteTwoSrc || Kind == TTI::SK_PermuteSingleSrc) &&
-      (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
+      (isZIPMask(Mask, LT.second.getVectorNumElements(), Unused, Unused) ||
       isUZPMask(Mask, LT.second.getVectorNumElements(), Unused) ||
       isREVMask(Mask, LT.second.getScalarSizeInBits(),
                 LT.second.getVectorNumElements(), 16) ||
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
index 23dcaea2ac1a4..b1945dc76f269 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp
@@ -252,10 +252,11 @@ bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
               ShuffleVectorPseudo &MatchInfo) {
   assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
   unsigned WhichResult;
+  unsigned OperandOrder;
   ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
   Register Dst = MI.getOperand(0).getReg();
   unsigned NumElts = MRI.getType(Dst).getNumElements();
-  if (!isZIPMask(ShuffleMask, NumElts, WhichResult))
+  if (!isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder))
     return false;
   unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2;
   Register V1 = MI.getOperand(1).getReg();
diff --git a/llvm/test/CodeGen/AArch64/arm64-zip.ll b/llvm/test/CodeGen/AArch64/arm64-zip.ll
index 9b06620590cda..c1d4a317cdf3f 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zip.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zip.ll
@@ -355,48 +355,38 @@ define <8 x i16> @combine_v8i16_undef(<4 x i16> %0, <4 x i16> %1) {
   ret <8 x i16> %3
 }
 
-; FIXME: This could be zip1 too, 8,0,9,1... pattern is handled
 define <16 x i8> @combine_v8i16_8first(<8 x i8> %0, <8 x i8> %1) {
 ; CHECK-SD-LABEL: combine_v8i16_8first:
 ; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1_q2
-; CHECK-SD-NEXT:    adrp x8, .LCPI25_0
-; CHECK-SD-NEXT:    fmov d2, d0
-; CHECK-SD-NEXT:    ldr q3, [x8, :lo12:.LCPI25_0]
-; CHECK-SD-NEXT:    tbl.16b v0, { v1, v2 }, v3
+; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-SD-NEXT:    zip1.16b v0, v0, v1
 ; CHECK-SD-NEXT:    ret
 ;
 ; CHECK-GI-LABEL: combine_v8i16_8first:
 ; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q31_q0
-; CHECK-GI-NEXT:    adrp x8, .LCPI25_0
-; CHECK-GI-NEXT:    fmov d31, d1
-; CHECK-GI-NEXT:    ldr q2, [x8, :lo12:.LCPI25_0]
-; CHECK-GI-NEXT:    tbl.16b v0, { v31, v0 }, v2
+; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-GI-NEXT:    zip1.16b v0, v1, v0
 ; CHECK-GI-NEXT:    ret
   %3 = shufflevector <8 x i8> %1, <8 x i8> %0, <16 x i32>
                      <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
   ret <16 x i8> %3
 }
 
-; FIXME: This could be zip1 too, 8,0,9,1...
pattern is handled define <16 x i8> @combine_v8i16_8firstundef(<8 x i8> %0, <8 x i8> %1) { ; CHECK-SD-LABEL: combine_v8i16_8firstundef: ; CHECK-SD: // %bb.0: -; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1_q2 -; CHECK-SD-NEXT: adrp x8, .LCPI26_0 -; CHECK-SD-NEXT: fmov d2, d0 -; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI26_0] -; CHECK-SD-NEXT: tbl.16b v0, { v1, v2 }, v3 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: zip1.16b v0, v0, v1 ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: combine_v8i16_8firstundef: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q31_q0 -; CHECK-GI-NEXT: adrp x8, .LCPI26_0 -; CHECK-GI-NEXT: fmov d31, d1 -; CHECK-GI-NEXT: ldr q2, [x8, :lo12:.LCPI26_0] -; CHECK-GI-NEXT: tbl.16b v0, { v31, v0 }, v2 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: zip1.16b v0, v1, v0 ; CHECK-GI-NEXT: ret %3 = shufflevector <8 x i8> %1, <8 x i8> %0, <16 x i32> ret <16 x i8> %3 diff --git a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll index 4ab5db450a7f3..282e0503dd7be 100644 --- a/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll +++ b/llvm/test/CodeGen/AArch64/fixed-vector-deinterleave.ll @@ -8,9 +8,9 @@ define {<2 x half>, <2 x half>} @vector_deinterleave_v2f16_v4f16(<4 x half> %vec ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: dup v2.2s, v0.s[1] ; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: zip1 v2.4h, v0.4h, v2.4h ; CHECK-SD-NEXT: mov v1.h[0], v0.h[1] -; CHECK-SD-NEXT: mov v0.h[1], v2.h[0] -; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: fmov d0, d2 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1 ; CHECK-SD-NEXT: ret ; diff --git a/llvm/test/CodeGen/AArch64/insert-extend.ll b/llvm/test/CodeGen/AArch64/insert-extend.ll index e128abf4d7376..1f2bacea5edef 100644 --- a/llvm/test/CodeGen/AArch64/insert-extend.ll +++ b/llvm/test/CodeGen/AArch64/insert-extend.ll @@ -66,57 +66,57 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca ; CHECK-NEXT: ldr d5, [x11, x9] ; CHECK-NEXT: shll2 v6.4s, v0.8h, #16 ; CHECK-NEXT: usubl v2.8h, v2.8b, v3.8b +; CHECK-NEXT: shll2 v7.4s, v1.8h, #16 ; CHECK-NEXT: usubl v3.8h, v4.8b, v5.8b -; CHECK-NEXT: shll2 v4.4s, v1.8h, #16 ; CHECK-NEXT: saddw v0.4s, v6.4s, v0.4h -; CHECK-NEXT: shll2 v6.4s, v2.8h, #16 +; CHECK-NEXT: shll2 v4.4s, v2.8h, #16 +; CHECK-NEXT: saddw v1.4s, v7.4s, v1.4h ; CHECK-NEXT: shll2 v5.4s, v3.8h, #16 -; CHECK-NEXT: saddw v1.4s, v4.4s, v1.4h -; CHECK-NEXT: rev64 v4.4s, v0.4s -; CHECK-NEXT: saddw v2.4s, v6.4s, v2.4h +; CHECK-NEXT: rev64 v6.4s, v0.4s +; CHECK-NEXT: saddw v2.4s, v4.4s, v2.4h +; CHECK-NEXT: rev64 v7.4s, v1.4s ; CHECK-NEXT: saddw v3.4s, v5.4s, v3.4h -; CHECK-NEXT: rev64 v5.4s, v1.4s -; CHECK-NEXT: rev64 v6.4s, v2.4s -; CHECK-NEXT: sub v4.4s, v0.4s, v4.4s +; CHECK-NEXT: rev64 v4.4s, v2.4s +; CHECK-NEXT: sub v6.4s, v0.4s, v6.4s ; CHECK-NEXT: addp v0.4s, v1.4s, v0.4s -; CHECK-NEXT: rev64 v7.4s, v3.4s -; CHECK-NEXT: sub v5.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v6.4s, v2.4s, v6.4s +; CHECK-NEXT: rev64 v5.4s, v3.4s +; CHECK-NEXT: sub v7.4s, v1.4s, v7.4s +; CHECK-NEXT: sub v4.4s, v2.4s, v4.4s ; CHECK-NEXT: addp v2.4s, v3.4s, v2.4s -; CHECK-NEXT: zip1 v16.4s, v5.4s, v4.4s -; CHECK-NEXT: sub v7.4s, v3.4s, v7.4s -; CHECK-NEXT: trn1 v4.4s, v5.4s, v4.4s -; CHECK-NEXT: zip2 v3.4s, v6.4s, v7.4s -; 
CHECK-NEXT: mov v6.s[1], v7.s[0] +; CHECK-NEXT: zip1 v16.4s, v7.4s, v6.4s +; CHECK-NEXT: sub v5.4s, v3.4s, v5.4s +; CHECK-NEXT: trn1 v3.4s, v7.4s, v6.4s +; CHECK-NEXT: zip1 v6.4s, v4.4s, v5.4s +; CHECK-NEXT: zip2 v4.4s, v4.4s, v5.4s +; CHECK-NEXT: ext v5.16b, v7.16b, v16.16b, #8 ; CHECK-NEXT: ext v7.16b, v2.16b, v2.16b, #8 -; CHECK-NEXT: ext v5.16b, v5.16b, v16.16b, #8 -; CHECK-NEXT: mov v3.d[1], v4.d[1] -; CHECK-NEXT: uzp1 v1.4s, v7.4s, v0.4s -; CHECK-NEXT: uzp2 v4.4s, v7.4s, v0.4s +; CHECK-NEXT: mov v4.d[1], v3.d[1] ; CHECK-NEXT: mov v6.d[1], v5.d[1] +; CHECK-NEXT: uzp1 v1.4s, v7.4s, v0.4s +; CHECK-NEXT: uzp2 v3.4s, v7.4s, v0.4s ; CHECK-NEXT: addp v0.4s, v2.4s, v0.4s -; CHECK-NEXT: sub v1.4s, v1.4s, v4.4s +; CHECK-NEXT: add v5.4s, v4.4s, v6.4s +; CHECK-NEXT: sub v4.4s, v6.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v1.4s, v3.4s ; CHECK-NEXT: rev64 v7.4s, v0.4s -; CHECK-NEXT: add v5.4s, v3.4s, v6.4s -; CHECK-NEXT: sub v3.4s, v6.4s, v3.4s +; CHECK-NEXT: rev64 v3.4s, v5.4s +; CHECK-NEXT: rev64 v6.4s, v4.4s ; CHECK-NEXT: rev64 v2.4s, v1.4s -; CHECK-NEXT: rev64 v4.4s, v5.4s -; CHECK-NEXT: rev64 v6.4s, v3.4s ; CHECK-NEXT: addp v16.4s, v0.4s, v5.4s ; CHECK-NEXT: sub v0.4s, v0.4s, v7.4s -; CHECK-NEXT: zip1 v21.4s, v16.4s, v16.4s -; CHECK-NEXT: sub v4.4s, v5.4s, v4.4s -; CHECK-NEXT: addp v5.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v3.4s, v3.4s, v6.4s +; CHECK-NEXT: sub v3.4s, v5.4s, v3.4s +; CHECK-NEXT: addp v5.4s, v1.4s, v4.4s +; CHECK-NEXT: sub v4.4s, v4.4s, v6.4s ; CHECK-NEXT: sub v1.4s, v1.4s, v2.4s ; CHECK-NEXT: ext v7.16b, v0.16b, v16.16b, #4 -; CHECK-NEXT: ext v2.16b, v16.16b, v4.16b, #4 -; CHECK-NEXT: ext v6.16b, v5.16b, v3.16b, #4 -; CHECK-NEXT: mov v19.16b, v4.16b +; CHECK-NEXT: zip1 v21.4s, v16.4s, v16.4s +; CHECK-NEXT: ext v2.16b, v16.16b, v3.16b, #4 +; CHECK-NEXT: ext v6.16b, v5.16b, v4.16b, #4 +; CHECK-NEXT: mov v19.16b, v3.16b ; CHECK-NEXT: ext v17.16b, v1.16b, v5.16b, #8 -; CHECK-NEXT: mov v20.16b, v3.16b -; CHECK-NEXT: trn2 v0.4s, v21.4s, v0.4s +; CHECK-NEXT: mov v20.16b, v4.16b ; CHECK-NEXT: ext v7.16b, v7.16b, v7.16b, #4 +; CHECK-NEXT: trn2 v0.4s, v21.4s, v0.4s ; CHECK-NEXT: mov v19.s[2], v16.s[3] ; CHECK-NEXT: zip2 v2.4s, v2.4s, v16.4s ; CHECK-NEXT: zip2 v6.4s, v6.4s, v5.4s @@ -125,8 +125,8 @@ define i32 @large(ptr nocapture noundef readonly %p1, i32 noundef %st1, ptr noca ; CHECK-NEXT: mov v1.s[2], v5.s[1] ; CHECK-NEXT: mov v21.16b, v7.16b ; CHECK-NEXT: sub v7.4s, v0.4s, v7.4s -; CHECK-NEXT: ext v2.16b, v4.16b, v2.16b, #12 -; CHECK-NEXT: ext v3.16b, v3.16b, v6.16b, #12 +; CHECK-NEXT: ext v2.16b, v3.16b, v2.16b, #12 +; CHECK-NEXT: ext v3.16b, v4.16b, v6.16b, #12 ; CHECK-NEXT: uzp2 v4.4s, v17.4s, v18.4s ; CHECK-NEXT: mov v6.16b, v1.16b ; CHECK-NEXT: mov v17.16b, v19.16b diff --git a/llvm/test/CodeGen/AArch64/insert-subvector.ll b/llvm/test/CodeGen/AArch64/insert-subvector.ll index 6828fa9f1508c..88b6ea4f0cb19 100644 --- a/llvm/test/CodeGen/AArch64/insert-subvector.ll +++ b/llvm/test/CodeGen/AArch64/insert-subvector.ll @@ -102,10 +102,7 @@ define <8 x i8> @insert_v8i8_4_1(float %tmp, <8 x i8> %b, <8 x i8> %a) { define <8 x i8> @insert_v8i8_4_2(float %tmp, <8 x i8> %b, <8 x i8> %a) { ; CHECK-LABEL: insert_v8i8_4_2: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, d1 -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: zip1 v0.2s, v1.2s, v2.2s ; CHECK-NEXT: ret %s2 = shufflevector <8 x i8> %a, <8 x i8> %b, <8 x i32> ret <8 x i8> %s2 @@ -124,8 +121,7 @@ define <16 x i8> @insert_v16i8_8_1(float %tmp, 
<16 x i8> %b, <16 x i8> %a) { define <16 x i8> @insert_v16i8_8_2(float %tmp, <16 x i8> %b, <16 x i8> %a) { ; CHECK-LABEL: insert_v16i8_8_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d ; CHECK-NEXT: ret %s2 = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> ret <16 x i8> %s2 @@ -201,10 +197,7 @@ define <4 x i16> @insert_v4i16_2_1(float %tmp, <4 x i16> %b, <4 x i16> %a) { define <4 x i16> @insert_v4i16_2_2(float %tmp, <4 x i16> %b, <4 x i16> %a) { ; CHECK-LABEL: insert_v4i16_2_2: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, d1 -; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: zip1 v0.2s, v1.2s, v2.2s ; CHECK-NEXT: ret %s2 = shufflevector <4 x i16> %a, <4 x i16> %b, <4 x i32> ret <4 x i16> %s2 @@ -223,8 +216,7 @@ define <8 x i16> @insert_v8i16_4_1(float %tmp, <8 x i16> %b, <8 x i16> %a) { define <8 x i16> @insert_v8i16_4_2(float %tmp, <8 x i16> %b, <8 x i16> %a) { ; CHECK-LABEL: insert_v8i16_4_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d ; CHECK-NEXT: ret %s2 = shufflevector <8 x i16> %a, <8 x i16> %b, <8 x i32> ret <8 x i16> %s2 @@ -245,8 +237,7 @@ define <4 x i32> @insert_v4i32_2_1(float %tmp, <4 x i32> %b, <4 x i32> %a) { define <4 x i32> @insert_v4i32_2_2(float %tmp, <4 x i32> %b, <4 x i32> %a) { ; CHECK-LABEL: insert_v4i32_2_2: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v0.16b, v1.16b -; CHECK-NEXT: mov v0.d[1], v2.d[0] +; CHECK-NEXT: zip1 v0.2d, v1.2d, v2.2d ; CHECK-NEXT: ret %s2 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> ret <4 x i32> %s2 @@ -337,10 +328,8 @@ define <8 x i8> @load_v8i8_4_1(float %tmp, <8 x i8> %b, ptr %a) { define <8 x i8> @load_v8i8_4_2(float %tmp, <8 x i8> %b, ptr %a) { ; CHECK-LABEL: load_v8i8_4_2: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, d1 -; CHECK-NEXT: ldr s2, [x0] -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ret %l = load <4 x i8>, ptr %a %s1 = shufflevector <4 x i8> %l, <4 x i8> poison, <8 x i32> @@ -465,10 +454,8 @@ define <4 x i8> @load_v4i8_2_2(float %tmp, <4 x i8> %b, ptr %a) { ; CHECK-LABEL: load_v4i8_2_2: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr h0, [x0] -; CHECK-NEXT: zip1 v2.8b, v0.8b, v0.8b -; CHECK-NEXT: fmov d0, d1 -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: zip1 v0.8b, v0.8b, v0.8b +; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ret %l = load <2 x i8>, ptr %a %s1 = shufflevector <2 x i8> %l, <2 x i8> poison, <4 x i32> @@ -558,10 +545,8 @@ define <4 x i16> @load_v4i16_2_1(float %tmp, <4 x i16> %b, ptr %a) { define <4 x i16> @load_v4i16_2_2(float %tmp, <4 x i16> %b, ptr %a) { ; CHECK-LABEL: load_v4i16_2_2: ; CHECK: // %bb.0: -; CHECK-NEXT: fmov d0, d1 -; CHECK-NEXT: ldr s2, [x0] -; CHECK-NEXT: mov v0.s[1], v2.s[0] -; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ret %l = load <2 x i16>, ptr %a %s1 = shufflevector <2 x i16> %l, <2 x i16> poison, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll b/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll index afcced5dcb9ab..b05e5773cdbd1 100644 --- a/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll +++ 
b/llvm/test/CodeGen/AArch64/neon-widen-shuffle.ll @@ -24,7 +24,7 @@ entry: define <4 x i32> @shuffle3(<4 x i32> %a, <4 x i32> %b) { ; CHECK-LABEL: shuffle3: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov v0.d[0], v1.d[1] +; CHECK-NEXT: zip2 v0.2d, v1.2d, v0.2d ; CHECK-NEXT: ret entry: %res = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> @@ -113,8 +113,7 @@ define <8 x i16> @shuffle10(<8 x i16> %a) { define <4 x i16> @shuffle11(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: shuffle11: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: mov v1.s[1], v0.s[0] -; CHECK-NEXT: fmov d0, d1 +; CHECK-NEXT: zip1 v0.2s, v1.2s, v0.2s ; CHECK-NEXT: ret entry: %res = shufflevector <8 x i16> %a, <8 x i16> %b, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll index 354edc4ff7ab4..072f6f4e8f73e 100644 --- a/llvm/test/CodeGen/AArch64/reduce-shuffle.ll +++ b/llvm/test/CodeGen/AArch64/reduce-shuffle.ll @@ -34,27 +34,26 @@ define i32 @v1(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: saddw v3.4s, v7.4s, v3.4h ; CHECK-NEXT: uzp2 v4.4s, v1.4s, v2.4s ; CHECK-NEXT: zip1 v5.4s, v3.4s, v0.4s -; CHECK-NEXT: mov v6.16b, v2.16b -; CHECK-NEXT: trn1 v7.4s, v3.4s, v0.4s +; CHECK-NEXT: trn1 v6.4s, v3.4s, v0.4s ; CHECK-NEXT: zip2 v0.4s, v3.4s, v0.4s -; CHECK-NEXT: ext v17.16b, v1.16b, v1.16b, #12 -; CHECK-NEXT: zip2 v18.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v16.4s, v2.4s, v1.4s -; CHECK-NEXT: mov v6.s[1], v1.s[0] +; CHECK-NEXT: ext v16.16b, v1.16b, v1.16b, #12 +; CHECK-NEXT: zip2 v17.4s, v1.4s, v2.4s +; CHECK-NEXT: zip2 v7.4s, v2.4s, v1.4s +; CHECK-NEXT: zip1 v18.4s, v2.4s, v1.4s ; CHECK-NEXT: uzp2 v4.4s, v4.4s, v1.4s ; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #8 ; CHECK-NEXT: mov v1.s[0], v2.s[1] -; CHECK-NEXT: ext v2.16b, v2.16b, v17.16b, #12 -; CHECK-NEXT: mov v18.d[1], v7.d[1] -; CHECK-NEXT: mov v16.d[1], v7.d[1] +; CHECK-NEXT: ext v2.16b, v2.16b, v16.16b, #12 +; CHECK-NEXT: mov v17.d[1], v6.d[1] +; CHECK-NEXT: mov v7.d[1], v6.d[1] ; CHECK-NEXT: mov v4.d[1], v0.d[1] -; CHECK-NEXT: mov v6.d[1], v3.d[1] +; CHECK-NEXT: mov v18.d[1], v3.d[1] ; CHECK-NEXT: mov v1.d[1], v5.d[1] ; CHECK-NEXT: mov v2.d[1], v0.d[1] -; CHECK-NEXT: add v0.4s, v4.4s, v18.4s -; CHECK-NEXT: add v3.4s, v1.4s, v6.4s -; CHECK-NEXT: sub v1.4s, v6.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v16.4s, v2.4s +; CHECK-NEXT: add v0.4s, v4.4s, v17.4s +; CHECK-NEXT: add v3.4s, v1.4s, v18.4s +; CHECK-NEXT: sub v1.4s, v18.4s, v1.4s +; CHECK-NEXT: sub v2.4s, v7.4s, v2.4s ; CHECK-NEXT: rev64 v4.4s, v0.4s ; CHECK-NEXT: rev64 v5.4s, v3.4s ; CHECK-NEXT: sub v6.4s, v1.4s, v2.4s @@ -239,99 +238,98 @@ define i32 @v2(ptr nocapture noundef readonly %p1, i32 noundef %i1, ptr nocaptur ; CHECK-NEXT: add x10, x10, x8 ; CHECK-NEXT: ldr d3, [x11] ; CHECK-NEXT: add x11, x11, x9 -; CHECK-NEXT: ldr d4, [x10, x8] -; CHECK-NEXT: ldr d6, [x10] -; CHECK-NEXT: ldr d5, [x11, x9] -; CHECK-NEXT: ldr d7, [x11] +; CHECK-NEXT: ldr d4, [x10] +; CHECK-NEXT: ldr d6, [x10, x8] +; CHECK-NEXT: ldr d5, [x11] +; CHECK-NEXT: ldr d7, [x11, x9] ; CHECK-NEXT: usubl v0.8h, v0.8b, v1.8b ; CHECK-NEXT: usubl v1.8h, v2.8b, v3.8b ; CHECK-NEXT: usubl v2.8h, v4.8b, v5.8b ; CHECK-NEXT: usubl v3.8h, v6.8b, v7.8b ; CHECK-NEXT: shll2 v4.4s, v0.8h, #16 ; CHECK-NEXT: shll2 v5.4s, v1.8h, #16 -; CHECK-NEXT: shll2 v6.4s, v2.8h, #16 -; CHECK-NEXT: shll2 v7.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v6.4s, v3.8h, #16 +; CHECK-NEXT: shll2 v7.4s, v2.8h, #16 ; CHECK-NEXT: saddw v0.4s, v4.4s, v0.4h ; CHECK-NEXT: saddw v1.4s, v5.4s, v1.4h -; CHECK-NEXT: 
saddw v2.4s, v6.4s, v2.4h -; CHECK-NEXT: saddw v3.4s, v7.4s, v3.4h +; CHECK-NEXT: saddw v3.4s, v6.4s, v3.4h +; CHECK-NEXT: saddw v2.4s, v7.4s, v2.4h ; CHECK-NEXT: zip1 v4.4s, v1.4s, v0.4s ; CHECK-NEXT: trn1 v18.4s, v1.4s, v0.4s ; CHECK-NEXT: zip2 v0.4s, v1.4s, v0.4s -; CHECK-NEXT: uzp2 v5.4s, v2.4s, v3.4s -; CHECK-NEXT: mov v6.16b, v2.16b -; CHECK-NEXT: mov v16.16b, v3.16b -; CHECK-NEXT: zip2 v7.4s, v2.4s, v3.4s -; CHECK-NEXT: mov v6.s[0], v3.s[1] +; CHECK-NEXT: uzp2 v5.4s, v3.4s, v2.4s +; CHECK-NEXT: mov v7.16b, v3.16b +; CHECK-NEXT: zip1 v6.4s, v2.4s, v3.4s +; CHECK-NEXT: zip2 v16.4s, v3.4s, v2.4s ; CHECK-NEXT: ext v17.16b, v1.16b, v4.16b, #8 -; CHECK-NEXT: mov v16.s[1], v2.s[0] -; CHECK-NEXT: uzp2 v1.4s, v5.4s, v2.4s -; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #12 -; CHECK-NEXT: zip2 v2.4s, v3.4s, v2.4s -; CHECK-NEXT: mov v7.d[1], v18.d[1] -; CHECK-NEXT: mov v6.d[1], v4.d[1] -; CHECK-NEXT: mov v16.d[1], v17.d[1] +; CHECK-NEXT: mov v7.s[0], v2.s[1] +; CHECK-NEXT: ext v1.16b, v3.16b, v3.16b, #12 +; CHECK-NEXT: uzp2 v5.4s, v5.4s, v3.4s +; CHECK-NEXT: zip2 v3.4s, v2.4s, v3.4s +; CHECK-NEXT: mov v16.d[1], v18.d[1] +; CHECK-NEXT: mov v6.d[1], v17.d[1] +; CHECK-NEXT: mov v7.d[1], v4.d[1] +; CHECK-NEXT: ext v1.16b, v2.16b, v1.16b, #12 +; CHECK-NEXT: mov v5.d[1], v0.d[1] +; CHECK-NEXT: mov v3.d[1], v18.d[1] +; CHECK-NEXT: add v2.4s, v7.4s, v6.4s ; CHECK-NEXT: mov v1.d[1], v0.d[1] -; CHECK-NEXT: ext v3.16b, v3.16b, v5.16b, #12 -; CHECK-NEXT: mov v2.d[1], v18.d[1] -; CHECK-NEXT: add v4.4s, v6.4s, v16.4s -; CHECK-NEXT: add v1.4s, v1.4s, v7.4s -; CHECK-NEXT: mov v3.d[1], v0.d[1] -; CHECK-NEXT: rev64 v5.4s, v4.4s -; CHECK-NEXT: rev64 v0.4s, v1.4s -; CHECK-NEXT: sub v2.4s, v2.4s, v3.4s -; CHECK-NEXT: sub v3.4s, v16.4s, v6.4s -; CHECK-NEXT: mov v5.d[1], v4.d[1] -; CHECK-NEXT: mov v0.d[1], v1.d[1] -; CHECK-NEXT: add v6.4s, v2.4s, v3.4s -; CHECK-NEXT: sub v2.4s, v3.4s, v2.4s -; CHECK-NEXT: add v1.4s, v1.4s, v5.4s -; CHECK-NEXT: sub v0.4s, v4.4s, v0.4s -; CHECK-NEXT: zip1 v3.4s, v1.4s, v6.4s -; CHECK-NEXT: uzp2 v4.4s, v1.4s, v6.4s -; CHECK-NEXT: zip2 v16.4s, v1.4s, v6.4s -; CHECK-NEXT: zip1 v5.4s, v0.4s, v2.4s -; CHECK-NEXT: trn1 v7.4s, v0.4s, v2.4s -; CHECK-NEXT: zip2 v2.4s, v0.4s, v2.4s -; CHECK-NEXT: trn2 v3.4s, v1.4s, v3.4s -; CHECK-NEXT: uzp2 v4.4s, v4.4s, v1.4s -; CHECK-NEXT: mov v1.s[1], v6.s[1] +; CHECK-NEXT: add v4.4s, v5.4s, v16.4s +; CHECK-NEXT: rev64 v5.4s, v2.4s +; CHECK-NEXT: rev64 v0.4s, v4.4s +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: sub v3.4s, v6.4s, v7.4s +; CHECK-NEXT: mov v5.d[1], v2.d[1] +; CHECK-NEXT: add v6.4s, v1.4s, v3.4s +; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s +; CHECK-NEXT: mov v0.d[1], v4.d[1] +; CHECK-NEXT: add v4.4s, v4.4s, v5.4s +; CHECK-NEXT: sub v0.4s, v2.4s, v0.4s +; CHECK-NEXT: zip1 v2.4s, v4.4s, v6.4s +; CHECK-NEXT: uzp2 v3.4s, v4.4s, v6.4s +; CHECK-NEXT: zip2 v16.4s, v4.4s, v6.4s +; CHECK-NEXT: zip1 v5.4s, v0.4s, v1.4s +; CHECK-NEXT: trn1 v7.4s, v0.4s, v1.4s +; CHECK-NEXT: zip2 v1.4s, v0.4s, v1.4s +; CHECK-NEXT: trn2 v2.4s, v4.4s, v2.4s +; CHECK-NEXT: uzp2 v3.4s, v3.4s, v4.4s +; CHECK-NEXT: mov v4.s[1], v6.s[1] ; CHECK-NEXT: ext v0.16b, v0.16b, v5.16b, #8 ; CHECK-NEXT: mov v16.d[1], v7.d[1] -; CHECK-NEXT: mov v4.d[1], v2.d[1] -; CHECK-NEXT: mov v1.d[1], v5.d[1] -; CHECK-NEXT: mov v3.d[1], v0.d[1] -; CHECK-NEXT: add v0.4s, v16.4s, v4.4s -; CHECK-NEXT: sub v4.4s, v4.4s, v16.4s -; CHECK-NEXT: add v2.4s, v1.4s, v3.4s -; CHECK-NEXT: sub v1.4s, v3.4s, v1.4s -; CHECK-NEXT: ext v3.16b, v0.16b, v0.16b, #4 -; CHECK-NEXT: zip2 v6.4s, v0.4s, v4.4s -; CHECK-NEXT: zip2 
v7.4s, v4.4s, v0.4s -; CHECK-NEXT: ext v5.16b, v2.16b, v2.16b, #4 -; CHECK-NEXT: zip2 v16.4s, v1.4s, v2.4s -; CHECK-NEXT: zip2 v17.4s, v2.4s, v1.4s -; CHECK-NEXT: zip1 v0.4s, v0.4s, v4.4s -; CHECK-NEXT: ext v18.16b, v3.16b, v4.16b, #8 -; CHECK-NEXT: ext v19.16b, v5.16b, v1.16b, #8 -; CHECK-NEXT: zip1 v1.4s, v2.4s, v1.4s +; CHECK-NEXT: mov v3.d[1], v1.d[1] +; CHECK-NEXT: mov v4.d[1], v5.d[1] +; CHECK-NEXT: mov v2.d[1], v0.d[1] +; CHECK-NEXT: add v0.4s, v16.4s, v3.4s +; CHECK-NEXT: sub v3.4s, v3.4s, v16.4s +; CHECK-NEXT: add v1.4s, v4.4s, v2.4s +; CHECK-NEXT: sub v2.4s, v2.4s, v4.4s +; CHECK-NEXT: ext v4.16b, v0.16b, v0.16b, #4 +; CHECK-NEXT: zip2 v6.4s, v0.4s, v3.4s +; CHECK-NEXT: zip2 v7.4s, v3.4s, v0.4s +; CHECK-NEXT: ext v5.16b, v1.16b, v1.16b, #4 +; CHECK-NEXT: zip2 v16.4s, v2.4s, v1.4s +; CHECK-NEXT: zip2 v17.4s, v1.4s, v2.4s +; CHECK-NEXT: zip1 v0.4s, v0.4s, v3.4s +; CHECK-NEXT: zip1 v1.4s, v1.4s, v2.4s +; CHECK-NEXT: ext v18.16b, v4.16b, v3.16b, #8 +; CHECK-NEXT: ext v19.16b, v5.16b, v2.16b, #8 ; CHECK-NEXT: add v2.4s, v16.4s, v7.4s -; CHECK-NEXT: sub v4.4s, v6.4s, v17.4s -; CHECK-NEXT: ext v3.16b, v18.16b, v3.16b, #4 +; CHECK-NEXT: sub v3.4s, v6.4s, v17.4s +; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ext v4.16b, v18.16b, v4.16b, #4 +; CHECK-NEXT: cmlt v1.8h, v3.8h, #0 ; CHECK-NEXT: cmlt v6.8h, v2.8h, #0 ; CHECK-NEXT: ext v5.16b, v19.16b, v5.16b, #4 -; CHECK-NEXT: sub v0.4s, v0.4s, v1.4s -; CHECK-NEXT: cmlt v1.8h, v4.8h, #0 ; CHECK-NEXT: add v2.4s, v6.4s, v2.4s -; CHECK-NEXT: add v4.4s, v1.4s, v4.4s -; CHECK-NEXT: add v3.4s, v5.4s, v3.4s +; CHECK-NEXT: add v3.4s, v1.4s, v3.4s +; CHECK-NEXT: add v4.4s, v5.4s, v4.4s ; CHECK-NEXT: cmlt v5.8h, v0.8h, #0 +; CHECK-NEXT: eor v1.16b, v3.16b, v1.16b ; CHECK-NEXT: eor v2.16b, v2.16b, v6.16b -; CHECK-NEXT: eor v1.16b, v4.16b, v1.16b -; CHECK-NEXT: cmlt v7.8h, v3.8h, #0 +; CHECK-NEXT: cmlt v7.8h, v4.8h, #0 ; CHECK-NEXT: add v0.4s, v5.4s, v0.4s ; CHECK-NEXT: add v1.4s, v2.4s, v1.4s -; CHECK-NEXT: add v3.4s, v7.4s, v3.4s +; CHECK-NEXT: add v3.4s, v7.4s, v4.4s ; CHECK-NEXT: eor v0.16b, v0.16b, v5.16b ; CHECK-NEXT: eor v2.16b, v3.16b, v7.16b ; CHECK-NEXT: add v0.4s, v0.4s, v1.4s From 564b1771efb0bd3c25ed5282585e614617dd36de Mon Sep 17 00:00:00 2001 From: Philip Ginsbach-Chen Date: Sun, 9 Nov 2025 14:58:52 +0000 Subject: [PATCH 2/4] fix GISel for zip1/zip2 with flipped operands --- .../GISel/AArch64PostLegalizerLowering.cpp | 4 +- llvm/test/CodeGen/AArch64/arm64-zip.ll | 38 ++++++------------- 2 files changed, 14 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp index b1945dc76f269..4fba593b3d0fb 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerLowering.cpp @@ -259,8 +259,8 @@ bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI, if (!isZIPMask(ShuffleMask, NumElts, WhichResult, OperandOrder)) return false; unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2; - Register V1 = MI.getOperand(1).getReg(); - Register V2 = MI.getOperand(2).getReg(); + Register V1 = MI.getOperand(OperandOrder == 0 ? 1 : 2).getReg(); + Register V2 = MI.getOperand(OperandOrder == 0 ? 
2 : 1).getReg();
   MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
   return true;
 }
diff --git a/llvm/test/CodeGen/AArch64/arm64-zip.ll b/llvm/test/CodeGen/AArch64/arm64-zip.ll
index c1d4a317cdf3f..44411a1032dca 100644
--- a/llvm/test/CodeGen/AArch64/arm64-zip.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-zip.ll
@@ -356,38 +356,24 @@ define <8 x i16> @combine_v8i16_undef(<4 x i16> %0, <4 x i16> %1) {
 }
 
 define <16 x i8> @combine_v8i16_8first(<8 x i8> %0, <8 x i8> %1) {
-; CHECK-SD-LABEL: combine_v8i16_8first:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT:    zip1.16b v0, v0, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: combine_v8i16_8first:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    zip1.16b v0, v1, v0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: combine_v8i16_8first:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    zip1.16b v0, v0, v1
+; CHECK-NEXT:    ret
   %3 = shufflevector <8 x i8> %1, <8 x i8> %0, <16 x i32>
                      <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3, i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
   ret <16 x i8> %3
 }
 
 define <16 x i8> @combine_v8i16_8firstundef(<8 x i8> %0, <8 x i8> %1) {
-; CHECK-SD-LABEL: combine_v8i16_8firstundef:
-; CHECK-SD:       // %bb.0:
-; CHECK-SD-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-SD-NEXT:    zip1.16b v0, v0, v1
-; CHECK-SD-NEXT:    ret
-;
-; CHECK-GI-LABEL: combine_v8i16_8firstundef:
-; CHECK-GI:       // %bb.0:
-; CHECK-GI-NEXT:    // kill: def $d0 killed $d0 def $q0
-; CHECK-GI-NEXT:    // kill: def $d1 killed $d1 def $q1
-; CHECK-GI-NEXT:    zip1.16b v0, v1, v0
-; CHECK-GI-NEXT:    ret
+; CHECK-LABEL: combine_v8i16_8firstundef:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    // kill: def $d0 killed $d0 def $q0
+; CHECK-NEXT:    // kill: def $d1 killed $d1 def $q1
+; CHECK-NEXT:    zip1.16b v0, v0, v1
+; CHECK-NEXT:    ret
   %3 = shufflevector <8 x i8> %1, <8 x i8> %0, <16 x i32>
   ret <16 x i8> %3
 }

From 54dc23327a5ae653f5df3d73f81e94bd7b18b8fb Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen
Date: Wed, 19 Nov 2025 19:23:53 +0000
Subject: [PATCH 3/4] introduce local variables EvenElt, OddElt

---
 .../lib/Target/AArch64/AArch64PerfectShuffle.h | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index 4b8cc61cc7d33..171768c21d3b2 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6640,23 +6640,25 @@ inline bool isZIPMask(ArrayRef<int> M, unsigned NumElts,
 
   // Check all elements match.
   for (unsigned i = 0; i != NumElts; i += 2) {
     if (M[i] >= 0) {
-      if ((unsigned)M[i] != i / 2)
+      unsigned EvenElt = (unsigned)M[i];
+      if (EvenElt != i / 2)
         Variant0Order0 = false;
-      if ((unsigned)M[i] != NumElts / 2 + i / 2)
+      if (EvenElt != NumElts / 2 + i / 2)
         Variant1Order0 = false;
-      if ((unsigned)M[i] != NumElts + i / 2)
+      if (EvenElt != NumElts + i / 2)
         Variant0Order1 = false;
-      if ((unsigned)M[i] != NumElts + NumElts / 2 + i / 2)
+      if (EvenElt != NumElts + NumElts / 2 + i / 2)
         Variant1Order1 = false;
     }
     if (M[i + 1] >= 0) {
-      if ((unsigned)M[i + 1] != NumElts + i / 2)
+      unsigned OddElt = (unsigned)M[i + 1];
+      if (OddElt != NumElts + i / 2)
         Variant0Order0 = false;
-      if ((unsigned)M[i + 1] != NumElts + NumElts / 2 + i / 2)
+      if (OddElt != NumElts + NumElts / 2 + i / 2)
         Variant1Order0 = false;
-      if ((unsigned)M[i + 1] != i / 2)
+      if (OddElt != i / 2)
         Variant0Order1 = false;
-      if ((unsigned)M[i + 1] != NumElts / 2 + i / 2)
+      if (OddElt != NumElts / 2 + i / 2)
         Variant1Order1 = false;
     }
   }

From 5c515476ee979d430f44a21f597ad25397a62f41 Mon Sep 17 00:00:00 2001
From: Philip Ginsbach-Chen
Date: Wed, 19 Nov 2025 19:27:30 +0000
Subject: [PATCH 4/4] comments explain WhichResultOut and OperandOrderOut

---
 llvm/lib/Target/AArch64/AArch64PerfectShuffle.h | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
index 171768c21d3b2..ef8786d0ad0e1 100644
--- a/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
+++ b/llvm/lib/Target/AArch64/AArch64PerfectShuffle.h
@@ -6622,10 +6622,10 @@ inline unsigned getPerfectShuffleCost(llvm::ArrayRef<int> M) {
 }
 
 /// Return true for zip1 or zip2 masks of the form:
-/// <0, 8, 1, 9, 2, 10, 3, 11> or
-/// <4, 12, 5, 13, 6, 14, 7, 15> or
-/// <8, 0, 9, 1, 10, 2, 11, 3> or
-/// <12, 4, 13, 5, 14, 6, 15, 7>
+/// <0, 8, 1, 9, 2, 10, 3, 11> (WhichResultOut = 0, OperandOrderOut = 0) or
+/// <4, 12, 5, 13, 6, 14, 7, 15> (WhichResultOut = 1, OperandOrderOut = 0) or
+/// <8, 0, 9, 1, 10, 2, 11, 3> (WhichResultOut = 0, OperandOrderOut = 1) or
+/// <12, 4, 13, 5, 14, 6, 15, 7> (WhichResultOut = 1, OperandOrderOut = 1)
 inline bool isZIPMask(ArrayRef<int> M, unsigned NumElts,
                       unsigned &WhichResultOut, unsigned &OperandOrderOut) {
@@ -6633,10 +6633,10 @@ inline bool isZIPMask(ArrayRef<int> M, unsigned NumElts,
 
   // "Variant" refers to the distinction between zip1 and zip2, while
   // "Order" refers to the sequence of input registers (matching vs flipped).
-  bool Variant0Order0 = true;
-  bool Variant1Order0 = true;
-  bool Variant0Order1 = true;
-  bool Variant1Order1 = true;
+  bool Variant0Order0 = true; // WhichResultOut = 0, OperandOrderOut = 0
+  bool Variant1Order0 = true; // WhichResultOut = 1, OperandOrderOut = 0
+  bool Variant0Order1 = true; // WhichResultOut = 0, OperandOrderOut = 1
+  bool Variant1Order1 = true; // WhichResultOut = 1, OperandOrderOut = 1
 
   // Check all elements match.
   for (unsigned i = 0; i != NumElts; i += 2) {
     if (M[i] >= 0) {
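
--
For reviewers, a minimal standalone reproducer of the flipped-operand case,
mirroring the combine_v8i16_8first test above (the function name is
illustrative, not part of the series):

; Even result lanes are drawn from the second shuffle operand, so this is a
; zip1 mask with the input registers flipped. With this series applied, both
; SDAG and GISel should select a single zip1 with swapped operands, along the
; lines of "zip1 v0.16b, v0.16b, v1.16b", instead of a tbl lookup through a
; constant pool.
define <16 x i8> @zip1_flipped(<8 x i8> %a, <8 x i8> %b) {
  %r = shufflevector <8 x i8> %b, <8 x i8> %a, <16 x i32>
       <i32 8, i32 0, i32 9, i32 1, i32 10, i32 2, i32 11, i32 3,
        i32 12, i32 4, i32 13, i32 5, i32 14, i32 6, i32 15, i32 7>
  ret <16 x i8> %r
}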