diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 4c932c523e423..e8e400030d281 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3232,6 +3232,9 @@ class LLVM_ABI TargetLoweringBase { /// Default to be the minimum interleave factor: 2. virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; } + /// Return true if the target has interleave with shuffles. + virtual bool hasInterleaveWithGatherScatter() const { return false; } + /// Lower an interleaved load to target specific intrinsics. Return /// true on success. /// diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp index 5c27a20869f81..bfb875a01f29f 100644 --- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp +++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp @@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef Mask, unsigned &Factor, /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...> /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7> static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, - unsigned MaxFactor) { + unsigned MaxFactor, + bool InterleaveWithShuffles) { unsigned NumElts = SVI->getShuffleMask().size(); if (NumElts < 4) return false; @@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor, return true; } + if (InterleaveWithShuffles) { + for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) { + Factor = i * MaxFactor; + if (SVI->isInterleave(Factor)) + return true; + } + } return false; } @@ -528,7 +536,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore( cast(SVI->getType())->getNumElements(); // Check if the shufflevector is RE-interleave shuffle. unsigned Factor; - if (!isReInterleaveMask(SVI, Factor, MaxFactor)) + if (!isReInterleaveMask(SVI, Factor, MaxFactor, + TLI->hasInterleaveWithGatherScatter())) return false; assert(NumStoredElements % Factor == 0 && "number of stored element should be a multiple of Factor"); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8f41f230b5521..8601bd8350f79 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -97,6 +97,7 @@ #include #include #include +#include #include #include #include @@ -18144,12 +18145,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const { - - assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() && - "Invalid interleave factor"); + assert(Factor >= 2 && "Invalid interleave factor"); auto *SI = dyn_cast(Store); if (!SI) return false; + + if (Factor > getMaxSupportedInterleaveFactor()) + return lowerInterleavedStoreWithShuffle(SI, SVI, Factor); + assert(!LaneMask && GapMask.popcount() == Factor && "Unexpected mask on store"); @@ -18295,6 +18298,160 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store, return true; } +/// If the interleaved vector elements are greater than supported MaxFactor, +/// interleaving the data with additional shuffles can be used to +/// achieve the same. +/// +/// Consider the following data with 8 interleaves which are shuffled to store +/// stN instructions. 
Data needs to be stored in this order: +/// [v0, v1, v2, v3, v4, v5, v6, v7] +/// +/// v0 v4 v2 v6 v1 v5 v3 v7 +/// | | | | | | | | +/// \ / \ / \ / \ / +/// [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7] ==> stN = 4 +/// | | | | +/// \ / \ / +/// \ / \ / +/// \ / \ / +/// [zip [v0,v2,v4,v6]] [zip [v1,v3,v5,v7]] ==> stN = 2 +/// +/// For stN = 4, upper half of interleaved data V0, V1, V2, V3 is stored +/// with one st4 instruction. Lower half, i.e, V4, V5, V6, V7 is stored with +/// another st4. +/// +/// For stN = 2, upper half of interleaved data V0, V1 is stored +/// with one st2 instruction. Second set V2, V3 is stored with another st2. +/// Total of 4 st2's are required here. +bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle( + StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const { + unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor(); + + auto *VecTy = cast(SVI->getType()); + assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store"); + + unsigned LaneLen = VecTy->getNumElements() / Factor; + Type *EltTy = VecTy->getElementType(); + auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen); + + const DataLayout &DL = SI->getModule()->getDataLayout(); + bool UseScalable; + + // Skip if we do not have NEON and skip illegal vector types. We can + // "legalize" wide vector types into multiple interleaved accesses as long as + // the vector types are divisible by 128. + if (!Subtarget->hasNEON() || + !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) + return false; + + if (UseScalable) + return false; + + std::deque Shuffles; + // If Only one operand is there in root shuffle. + if (isa(SVI->getOperand(1)) && + SVI->getType() == SVI->getOperand(0)->getType()) { + Value *Op0 = SVI->getOperand(0); + Shuffles.push_back(dyn_cast(Op0)); + } else + Shuffles.push_back(SVI); + unsigned ConcatLevel = Factor; + unsigned ConcatElt = Factor * LaneLen; + // Getting all the interleaved operands. 
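+  // Walk back down the tree of concatenating shuffles: pop each entry and
+  // either keep it (it is already a SubVecTy leaf), split a zero/poison
+  // constant into two half-width constants, or split a concat shuffle (the
+  // root SVI, or one with an identity mask) into its two operands, until
+  // only the Factor leaf sub-vectors remain in the deque.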
+ while (ConcatLevel > 1) { + unsigned InterleavedOperands = Shuffles.size(); + for (unsigned Ops = 0; Ops < InterleavedOperands; Ops++) { + auto *V = Shuffles.front(); + Shuffles.pop_front(); + if (isa(V)) { + VectorType *Ty = cast(V->getType()); + auto *HalfTy = VectorType::getHalfElementsVectorType(Ty); + Value *SplitValue = nullptr; + if (isa(V)) + SplitValue = ConstantAggregateZero::get(HalfTy); + else + SplitValue = PoisonValue::get(HalfTy); + + Shuffles.push_back(SplitValue); + Shuffles.push_back(SplitValue); + continue; + } + if (V->getType() == SubVecTy) { + Shuffles.push_back(V); + continue; + } + ShuffleVectorInst *SFL = dyn_cast(V); + if (!SFL) + return false; + if (SVI != SFL && !SFL->isIdentityMask(SFL->getShuffleMask(), ConcatElt)) + return false; + + Value *Op0 = SFL->getOperand(0); + Value *Op1 = SFL->getOperand(1); + + Shuffles.push_back(dyn_cast(Op0)); + Shuffles.push_back(dyn_cast(Op1)); + } + ConcatLevel >>= 1; + ConcatElt >>= 1; + } + + IRBuilder<> Builder(SI); + auto Mask = createInterleaveMask(LaneLen, 2); + SmallVector UpperHalfMask(LaneLen), LowerHalfMask(LaneLen); + for (unsigned Idx = 0; Idx < LaneLen; Idx++) { + LowerHalfMask[Idx] = Mask[Idx]; + UpperHalfMask[Idx] = Mask[Idx + LaneLen]; + } + + unsigned InterleaveFactor = Factor >> 1; + while (InterleaveFactor >= MaxSupportedFactor) { + std::deque ShufflesIntermediate; + ShufflesIntermediate.resize(Factor); + for (unsigned Idx = 0; Idx < Factor; Idx += (InterleaveFactor * 2)) { + for (unsigned GroupIdx = 0; GroupIdx < InterleaveFactor; GroupIdx++) { + assert(Shuffles[Idx + GroupIdx]->getType() == SubVecTy && + Shuffles[Idx + GroupIdx + InterleaveFactor]->getType() == + SubVecTy && + "Type of interleaving candidates are not matching\n"); + auto *Shuffle = Builder.CreateShuffleVector( + Shuffles[Idx + GroupIdx], + Shuffles[Idx + GroupIdx + InterleaveFactor], LowerHalfMask); + ShufflesIntermediate[Idx + GroupIdx] = Shuffle; + Shuffle = Builder.CreateShuffleVector( + Shuffles[Idx + GroupIdx], + Shuffles[Idx + GroupIdx + InterleaveFactor], UpperHalfMask); + ShufflesIntermediate[Idx + GroupIdx + InterleaveFactor] = Shuffle; + } + } + Shuffles = ShufflesIntermediate; + InterleaveFactor >>= 1; + } + + Type *PtrTy = SI->getPointerOperandType(); + auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen); + + Value *BaseAddr = SI->getPointerOperand(); + Function *StNFunc = getStructuredStoreFunction( + SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy); + for (unsigned N = 0; N < (Factor / MaxSupportedFactor); N++) { + SmallVector Ops; + for (unsigned OpIdx = 0; OpIdx < MaxSupportedFactor; OpIdx++) + Ops.push_back(Shuffles[N * MaxSupportedFactor + OpIdx]); + + if (N > 0) { + // We will compute the pointer operand of each store from the original + // base address using GEPs. Cast the base address to a pointer to the + // scalar element type. 
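+      // Each stN group covers LaneLen * MaxSupportedFactor scalar elements,
+      // so step the base address forward by that many elements for every
+      // group after the first.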
+ BaseAddr = Builder.CreateConstGEP1_32( + SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor); + } + Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy)); + Builder.CreateCall(StNFunc, Ops); + } + return true; +} + bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad( Instruction *Load, Value *Mask, IntrinsicInst *DI) const { const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID()); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index be198e54cbcbf..fb1fe826fa9ff 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -229,6 +229,8 @@ class AArch64TargetLowering : public TargetLowering { bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override; + bool hasInterleaveWithGatherScatter() const override { return true; } + unsigned getMaxSupportedInterleaveFactor() const override { return 4; } bool lowerInterleavedLoad(Instruction *Load, Value *Mask, @@ -239,6 +241,9 @@ class AArch64TargetLowering : public TargetLowering { ShuffleVectorInst *SVI, unsigned Factor, const APInt &GapMask) const override; + bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI, + unsigned Factor) const; + bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask, IntrinsicInst *DI) const override; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 0bae00bafee3c..9f182c0d29f04 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4932,11 +4932,28 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps)) return InstructionCost::getInvalid(); - if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) { + unsigned NumLoadStores = 1; + InstructionCost ShuffleCost = 0; + bool isInterleaveWithShuffle = false; + unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor(); + + auto *SubVecTy = + VectorType::get(VecVTy->getElementType(), + VecVTy->getElementCount().divideCoefficientBy(Factor)); + + if (TLI->hasInterleaveWithGatherScatter() && Opcode == Instruction::Store && + (0 == Factor % MaxSupportedFactor) && Factor > MaxSupportedFactor) { + isInterleaveWithShuffle = true; + + NumLoadStores = Factor / MaxSupportedFactor; + ShuffleCost = + (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy, + {}, CostKind, 0, SubVecTy)); + } + + if (!UseMaskForGaps && + (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) { unsigned MinElts = VecVTy->getElementCount().getKnownMinValue(); - auto *SubVecTy = - VectorType::get(VecVTy->getElementType(), - VecVTy->getElementCount().divideCoefficientBy(Factor)); // ldN/stN only support legal vector types of size 64 or 128 in bits. 
// Accesses having vector types that are a multiple of 128 bits can be @@ -4944,7 +4961,10 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( bool UseScalable; if (MinElts % Factor == 0 && TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) - return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); + return (Factor * + TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) * + NumLoadStores) + + ShuffleCost; } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll index 2f490c8f3f20f..d7528aac85a10 100644 --- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll +++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll @@ -730,6 +730,285 @@ entry: ret void } +define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, + <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) { +; CHECK-LABEL: store_factor8: +; CHECK: .Lfunc_begin17: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK: zip1 [[V1:.*s]], [[I1:.*s]], [[I5:.*s]] +; CHECK-NEXT: zip2 [[V5:.*s]], [[I1]], [[I5]] +; CHECK-NEXT: zip1 [[V2:.*s]], [[I2:.*s]], [[I6:.*s]] +; CHECK-NEXT: zip2 [[V6:.*s]], [[I2]], [[I6]] +; CHECK-NEXT: zip1 [[V3:.*s]], [[I3:.*s]], [[I7:.*s]] +; CHECK-NEXT: zip2 [[V7:.*s]], [[I3]], [[I7]] +; CHECK-NEXT: zip1 [[V4:.*s]], [[I4:.*s]], [[I8:.*s]] +; CHECK-NEXT: zip2 [[V8:.*s]], [[I4]], [[I8]] +; CHECK-NEXT: st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64 +; CHECK-NEXT: st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0] +; CHECK-NEXT: ret + + %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> + %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> + %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> + %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> + + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> + + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> + store <32 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define dso_local void @store_factor8_1(ptr %dst, ptr %temp1, i64 %offset.idx, <8 x i16> %x, <8 x i16> %y, <8 x i16> %z) { +; CHECK-LABEL: store_factor8_1: +; CHECK: .Lfunc_begin18: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ld1r { v3.8h }, [x1] +; CHECK-NEXT: movi v4.8h, #1 +; CHECK-NEXT: movi v19.2d, #0000000000000000 +; CHECK-NEXT: add x8, x0, x2, lsl #1 +; CHECK-NEXT: orr v3.8h, #1 +; CHECK-NEXT: add v3.8h, v3.8h, v4.8h +; CHECK-NEXT: zip1 v16.8h, v19.8h, v3.8h +; CHECK-NEXT: zip1 v17.8h, v0.8h, v19.8h +; CHECK-NEXT: zip1 v18.8h, v1.8h, v2.8h +; CHECK-NEXT: zip2 v3.8h, v19.8h, v3.8h +; CHECK-NEXT: zip2 v4.8h, v0.8h, v19.8h +; CHECK-NEXT: zip2 v5.8h, v1.8h, v2.8h +; CHECK-NEXT: mov v6.16b, v19.16b +; CHECK-NEXT: st4 { v16.8h, v17.8h, v18.8h, v19.8h }, [x8], #64 +; CHECK-NEXT: st4 { v3.8h, v4.8h, v5.8h, v6.8h }, [x8] +; CHECK-NEXT: ret +entry: + %0 = load i32, ptr %temp1, align 4 + %broadcast.splatinsert1 = insertelement <8 x i32> poison, i32 %0, i64 0 + %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> poison, <8 x i32> zeroinitializer + %1 = getelementptr i16, ptr %dst, i64 %offset.idx + %2 = trunc <8 x i32> %broadcast.splat2 to <8 x i16> + %3 = or <8 x i16> %2, splat (i16 1) + %4 = add <8 x i16> %3, splat (i16 1) + %5 = shufflevector <8 x i16> zeroinitializer, <8 x i16> %x, <16 x i32> + %6 = shufflevector 
<8 x i16> %y, <8 x i16> zeroinitializer, <16 x i32> + %7 = shufflevector <8 x i16> %4, <8 x i16> zeroinitializer, <16 x i32> + %8 = shufflevector <8 x i16> %z, <8 x i16> zeroinitializer, <16 x i32> + %9 = shufflevector <16 x i16> %5, <16 x i16> %6, <32 x i32> + %10 = shufflevector <16 x i16> %7, <16 x i16> %8, <32 x i32> + %11 = shufflevector <32 x i16> %9, <32 x i16> %10, <64 x i32> + %interleaved.vec = shufflevector <64 x i16> %11, <64 x i16> poison, <64 x i32> + store <64 x i16> %interleaved.vec, ptr %1, align 2 + ret void +} + +define void @store_factor16(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, + <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7, + <4 x i32> %a8, <4 x i32> %a9, <4 x i32> %a10, <4 x i32> %a11, + <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) { +; CHECK-LABEL: store_factor16: +; CHECK: .Lfunc_begin19: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK: zip1 [[V05:.*s]], [[I05:.*s]], [[I13:.*s]] +; CHECK-NEXT: zip1 [[V01:.*s]], [[I01:.*s]], [[I09:.*s]] +; CHECK-NEXT: zip1 [[V02:.*s]], [[I02:.*s]], [[I10:.*s]] +; CHECK-NEXT: zip1 [[V06:.*s]], [[I06:.*s]], [[I14:.*s]] +; CHECK-NEXT: zip1 [[V07:.*s]], [[I07:.*s]], [[I15:.*s]] +; CHECK-NEXT: zip2 [[V09:.*s]], [[I01]], [[I09]] +; CHECK-NEXT: zip2 [[V13:.*s]], [[I05]], [[I13]] +; CHECK-NEXT: zip1 [[V03:.*s]], [[I03:.*s]], [[I11:.*s]] +; CHECK-NEXT: zip1 [[V04:.*s]], [[I04:.*s]], [[I12:.*s]] +; CHECK-NEXT: zip1 [[V08:.*s]], [[I08:.*s]], [[I16:.*s]] +; CHECK-NEXT: zip2 [[V10:.*s]], [[I02]], [[I10]] +; CHECK-NEXT: zip2 [[V14:.*s]], [[I06]], [[I14]] +; CHECK-NEXT: zip2 [[V11:.*s]], [[I03]], [[I11]] +; CHECK-NEXT: zip1 [[V17:.*s]], [[V01]], [[V05]] +; CHECK-NEXT: zip2 [[V15:.*s]], [[I07]], [[I15]] +; CHECK-NEXT: zip2 [[V21:.*s]], [[V01]], [[V05]] +; CHECK-NEXT: zip1 [[V18:.*s]], [[V02]], [[V06]] +; CHECK-NEXT: zip2 [[V12:.*s]], [[I04]], [[I12]] +; CHECK-NEXT: zip2 [[V16:.*s]], [[I08]], [[I16]] +; CHECK-NEXT: zip1 [[V19:.*s]], [[V03]], [[V07]] +; CHECK-NEXT: zip2 [[V22:.*s]], [[V02]], [[V06]] +; CHECK-NEXT: zip1 [[V25:.*s]], [[V09]], [[V13]] +; CHECK-NEXT: zip1 [[V20:.*s]], [[V04]], [[V08]] +; CHECK-NEXT: zip2 [[V23:.*s]], [[V03]], [[V07]] +; CHECK-NEXT: zip1 [[V26:.*s]], [[V10]], [[V14]] +; CHECK-NEXT: zip2 [[V29:.*s]], [[V09]], [[V13]] +; CHECK-NEXT: zip2 [[V24:.*s]], [[V04]], [[V08]] +; CHECK-NEXT: zip1 [[V27:.*s]], [[V11]], [[V15]] +; CHECK-NEXT: zip2 [[V30:.*s]], [[V10]], [[V14]] +; CHECK-NEXT: zip1 [[V28:.*s]], [[V12]], [[V16]] +; CHECK-NEXT: zip2 [[V31:.*s]], [[V11]], [[V15]] +; CHECK-NEXT: zip2 [[V32:.*s]], [[V12]], [[V16]] +; CHECK-NEXT: st4 { [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64 +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: st4 { [[V21]], [[V22]], [[V23]], [[V24]] }, [x8] +; CHECK-NEXT: add x8, x0, #128 +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st4 { [[V25]], [[V26]], [[V27]], [[V28]] }, [x8] +; CHECK-NEXT: add x8, x0, #192 +; CHECK-NEXT: st4 { [[V29]], [[V30]], [[V31]], [[V32]] }, [x8] +; CHECK-NEXT: ldp d15, d14, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: ret + + %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> + %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> + %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> + %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> + %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> + %v5 = shufflevector <4 x i32> %a10, <4 x 
i32> %a11, <8 x i32> + %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> + %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> + + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> + %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> + %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> + + %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> + %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> + + %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32> + store <64 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define dso_local void @store_no_interleave(ptr noalias noundef readonly captures(none) %a, ptr noalias noundef readonly captures(none) %b, i8 noundef %c) { +; CHECK-LABEL: store_no_interleave: +; CHECK: .Lfunc_begin20: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: movi v0.4h, #1 +; CHECK-NEXT: fmov s1, w2 +; CHECK-NEXT: ldrb w8, [x0] +; CHECK-NEXT: adrp x9, .LCPI20_3 +; CHECK-NEXT: ldr q3, [x9, :lo12:.LCPI20_3] +; CHECK-NEXT: adrp x9, .LCPI20_1 +; CHECK-NEXT: ldr q5, [x9, :lo12:.LCPI20_1] +; CHECK-NEXT: and v0.8b, v1.8b, v0.8b +; CHECK-NEXT: dup v0.16b, v0.b[0] +; CHECK-NEXT: dup v1.16b, w2 +; CHECK-NEXT: mov v2.16b, v1.16b +; CHECK-NEXT: tbl v3.16b, { v0.16b, v1.16b }, v3.16b +; CHECK-NEXT: tbl v5.16b, { v0.16b, v1.16b }, v5.16b +; CHECK-NEXT: mov v2.b[2], w8 +; CHECK-NEXT: mov v2.b[10], w8 +; CHECK-NEXT: adrp x8, .LCPI20_2 +; CHECK-NEXT: ldr q4, [x8, :lo12:.LCPI20_2] +; CHECK-NEXT: adrp x8, .LCPI20_0 +; CHECK-NEXT: ldr q6, [x8, :lo12:.LCPI20_0] +; CHECK-NEXT: ldrsw x8, [x1] +; CHECK-NEXT: tbl v4.16b, { v0.16b, v1.16b }, v4.16b +; CHECK-NEXT: rev64 v2.4s, v2.4s +; CHECK-NEXT: tbl v0.16b, { v0.16b, v1.16b }, v6.16b +; CHECK-NEXT: lsl x8, x8, #3 +; CHECK-NEXT: trn2 v1.4s, v2.4s, v3.4s +; CHECK-NEXT: trn2 v3.4s, v2.4s, v4.4s +; CHECK-NEXT: trn2 v4.4s, v2.4s, v5.4s +; CHECK-NEXT: trn2 v0.4s, v2.4s, v0.4s +; CHECK-NEXT: stp q3, q1, [x8, #-32] +; CHECK-NEXT: stp q0, q4, [x8, #-64] +; CHECK-NEXT: ret +entry: + %b.promoted = load i32, ptr %b, align 4 + %0 = insertelement <2 x i8> poison, i8 %c, i64 0 + %1 = shufflevector <2 x i8> %0, <2 x i8> poison, <4 x i32> zeroinitializer + %2 = and <4 x i8> %1, + %3 = load i32, ptr %a, align 4 + %conv18 = trunc i32 %3 to i8 + %4 = sext i32 %b.promoted to i64 + %5 = add nsw i64 %4, -1 + %n.vec30 = and i64 %5, -8 + %broadcast.splatinsert33 = insertelement <8 x i8> poison, i8 %conv18, i64 0 + %broadcast.splat34 = shufflevector <8 x i8> %broadcast.splatinsert33, <8 x i8> poison, <8 x i32> zeroinitializer + %broadcast.splatinsert35 = insertelement <8 x i8> poison, i8 %c, i64 0 + %broadcast.splat36 = shufflevector <8 x i8> %broadcast.splatinsert35, <8 x i8> poison, <8 x i32> zeroinitializer + %15 = shufflevector <8 x i8> %broadcast.splatinsert35, <8 x i8> poison, <16 x i32> + %16 = shufflevector <8 x i8> %broadcast.splat34, <8 x i8> %broadcast.splat36, <16 x i32> + %17 = shufflevector <4 x i8> %2, <4 x i8> poison, <16 x i32> zeroinitializer + %18 = shufflevector <8 x i8> %broadcast.splatinsert35, <8 x i8> poison, <16 x i32> zeroinitializer + %19 = shufflevector <16 x i8> %15, <16 x i8> %16, <32 x i32> + %20 = shufflevector <16 x i8> %17, <16 x i8> %18, <32 x i32> + %interleaved.vec45 = shufflevector <32 x i8> %19, <32 x i8> %20, <64 x i32> + %offset.idx = sub i64 %5, 0 + %21 = shl nsw i64 %offset.idx, 3 + %22 = getelementptr i8, ptr null, i64 %21 + %23 = 
getelementptr i8, ptr %22, i64 -56 + store <64 x i8> %interleaved.vec45, ptr %23, align 8 + ret void +} + +define dso_local void @store_no_interleave1(ptr %a, ptr %b, ptr %c, ptr %d, ptr %e, ptr %f, ptr %g, ptr %h, ptr %i, +; CHECK-LABEL: store_no_interleave1: +; CHECK: .Lfunc_begin21: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: // %entry +; CHECK-NEXT: ldr x8, [sp] +; CHECK-NEXT: ldr d3, [x8] +; CHECK-NEXT: zip1 v4.4h, v3.4h, v0.4h +; CHECK-NEXT: zip2 v0.4h, v3.4h, v0.4h +; CHECK-NEXT: stp d4, d0, [x0] +; CHECK-NEXT: ldr d0, [x8] +; CHECK-NEXT: zip1 v3.4h, v0.4h, v1.4h +; CHECK-NEXT: zip2 v1.4h, v0.4h, v1.4h +; CHECK-NEXT: zip2 v0.4h, v0.4h, v2.4h +; CHECK-NEXT: stp d3, d1, [x0] +; CHECK-NEXT: zip2 v5.2s, v4.2s, v3.2s +; CHECK-NEXT: ldr d1, [x8] +; CHECK-NEXT: zip1 v1.4h, v1.4h, v2.4h +; CHECK-NEXT: zip1 v2.2s, v4.2s, v3.2s +; CHECK-NEXT: stp d1, d0, [x0] +; CHECK-NEXT: zip2 v0.2s, v3.2s, v1.2s +; CHECK-NEXT: str d4, [x1] +; CHECK-NEXT: zip1 v4.2s, v3.2s, v1.2s +; CHECK-NEXT: str d3, [x2] +; CHECK-NEXT: stp d2, d5, [x3] +; CHECK-NEXT: mov v5.d[1], v0.d[0] +; CHECK-NEXT: str d3, [x4] +; CHECK-NEXT: str d1, [x5] +; CHECK-NEXT: stp d4, d0, [x6] +; CHECK-NEXT: str q5, [x7] +; CHECK-NEXT: ret + <4 x i16> %j, <4 x i16> %k, <4 x i16> %l) local_unnamed_addr #0 { +entry: + %0 = load <4 x i16>, ptr %i, align 8 + %vzip.i = shufflevector <4 x i16> %0, <4 x i16> %j, <4 x i32> + store <4 x i16> %vzip.i, ptr %a, align 8 + %vzip1.i = shufflevector <4 x i16> %0, <4 x i16> %j, <4 x i32> + %1 = getelementptr inbounds nuw i8, ptr %a, i64 8 + store <4 x i16> %vzip1.i, ptr %1 , align 8 + %2 = load <4 x i16>, ptr %i, align 8 + %vzip.i22 = shufflevector <4 x i16> %2, <4 x i16> %k, <4 x i32> + store <4 x i16> %vzip.i22, ptr %a, align 8 + %vzip1.i23 = shufflevector <4 x i16> %2, <4 x i16> %k, <4 x i32> + store <4 x i16> %vzip1.i23, ptr %1, align 8 + %3 = load <4 x i16>, ptr %i, align 8 + %vzip.i26 = shufflevector <4 x i16> %3, <4 x i16> %l, <4 x i32> + store <4 x i16> %vzip.i26, ptr %a, align 8 + %vzip1.i27 = shufflevector <4 x i16> %2, <4 x i16> %l, <4 x i32> + store <4 x i16> %vzip1.i27, ptr %1, align 8 + store <4 x i16> %vzip.i, ptr %b, align 8 + store <4 x i16> %vzip.i22, ptr %c, align 8 + %4 = shufflevector <4 x i16> %vzip.i, <4 x i16> %vzip.i22, <4 x i32> + %5 = shufflevector <4 x i16> %vzip.i, <4 x i16> %vzip.i22, <4 x i32> + store <4 x i16> %4, ptr %d, align 8 + %6 = getelementptr inbounds nuw i8, ptr %d, i64 8 + store <4 x i16> %5, ptr %6, align 8 + store <4 x i16> %vzip.i22, ptr %e, align 8 + store <4 x i16> %vzip.i26, ptr %f, align 8 + %7 = shufflevector <4 x i16> %vzip.i22, <4 x i16> %vzip.i26, <4 x i32> + %8 = shufflevector <4 x i16> %vzip.i22, <4 x i16> %vzip.i26, <4 x i32> + store <4 x i16> %7, ptr %g, align 8 + %9 = getelementptr inbounds nuw i8, ptr %g, i64 8 + store <4 x i16> %8, ptr %9, align 8 + %10 = shufflevector <4 x i16> %5, <4 x i16> %8, <8 x i32> + store <8 x i16> %10, ptr %h, align 16 + ret void +} + declare void @llvm.dbg.value(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-inseltpoison.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-inseltpoison.ll index 14986d9eb85c5..f3c19b6994c13 100644 --- a/llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-inseltpoison.ll +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses-inseltpoison.ll @@ -753,3 +753,95 @@ define <4 x i1> @load_large_vector(ptr %p) { %ret = icmp ne <4 x ptr> %s1, %s2 
ret <4 x i1> %ret } + +define void @store_factor8_with_undef(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3){ +; NEON-LABEL: define void @store_factor8_with_undef( +; NEON: [[TMP1:%.*]] = shufflevector <4 x i32> [[A0:%.*]], <4 x i32> poison, <4 x i32> +; NEON-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A0]], <4 x i32> poison, <4 x i32> +; NEON-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A1:%.*]], <4 x i32> poison, <4 x i32> +; NEON-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[A1]], <4 x i32> poison, <4 x i32> +; NEON-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[A2:%.*]], <4 x i32> poison, <4 x i32> +; NEON-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[A2]], <4 x i32> poison, <4 x i32> +; NEON-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[A3:%.*]], <4 x i32> poison, <4 x i32> +; NEON-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[A3]], <4 x i32> poison, <4 x i32> +; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP1]], <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <4 x i32> [[TMP7]], ptr [[PTR]]) +; NEON-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[PTR]], i32 16 +; NEON-NEXT: call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> [[TMP2]], <4 x i32> [[TMP4]], <4 x i32> [[TMP6]], <4 x i32> [[TMP8]], ptr [[TMP9]]) +; NEON-NEXT: ret void +; NO_NEON-LABEL: @store_factor8_with_undef( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> + %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> poison, <32 x i32> + store <32 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_general_mask_factor8_undef_fail(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, + <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7){ +; NEON-LABEL: @store_general_mask_factor8_undef_fail( +; NEON-NOT: @llvm.aarch64.neon +; NEON: ret void +; NO_NEON-LABEL: @store_general_mask_factor8_undef_fail( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> + %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> + %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> + %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> + + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> + + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> + store <32 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_general_invalid_concat_mask(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, + <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7){ +; NEON-LABEL: @store_general_invalid_concat_mask( +; NEON-NOT: @llvm.aarch64.neon +; NEON: ret void +; NO_NEON-LABEL: @store_general_invalid_concat_mask( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> + %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> + %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> + %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> + + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> + + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> + store <32 x i32> 
%interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_no_interleave_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, + <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7){ +; NEON-LABEL: @store_no_interleave_factor8( +; NEON-NOT: @llvm.aarch64.neon +; NEON: ret void +; NO_NEON-LABEL: @store_no_interleave_factor8( +; NO_NEON-NOT: @llvm.aarch64.neon +; NO_NEON: ret void +; + %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> + %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> + %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> + %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> + + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> + + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <16 x i32> + store <16 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} \ No newline at end of file diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll new file mode 100644 index 0000000000000..88e184ca1ae7d --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses=true -max-interleave-group-factor=16 -S < %s | FileCheck %s + +define dso_local void @interleavedstore_16(ptr noalias noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out) { +; CHECK-LABEL: define dso_local void @interleavedstore_16( +; CHECK-SAME: ptr noalias noundef readonly captures(none) [[IN:%.*]], ptr noalias noundef writeonly captures(none) [[OUT:%.*]]) { +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[OUT]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 4 +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[IN]], i64 [[OFFSET_IDX2]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i8>, ptr [[NEXT_GEP3]], align 1, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC5]] +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i8> [[STRIDED_VEC4]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = add 
<4 x i8> [[STRIDED_VEC6]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> zeroinitializer, <4 x i8> [[STRIDED_VEC6]], <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC5]], <4 x i8> [[TMP0]], <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC4]], <4 x i8> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC]], <4 x i8> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP10]], <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> [[TMP18]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x i8> [[TMP19]], <16 x i8> [[TMP20]], <32 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP21]], <16 x i8> [[TMP22]], <32 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> [[TMP24]], <64 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <64 x i8> [[TMP25]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: store <64 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %vector.body, !llvm.loop [[LOOP5:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %i.033 = phi i32 [ 0, %entry ], [ %inc17, %for.body ] + %out.addr.032 = phi ptr [ %out, %entry ], [ %add.ptr, %for.body ] + %in.addr.031 = phi ptr [ %in, %entry ], [ %add.ptr15, %for.body ] + store i8 0, ptr %out.addr.032, align 1 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 3 + %0 = load i8, ptr %arrayidx10, align 1 + %arrayidx14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 1 + store i8 %0, ptr %arrayidx14, align 1 + %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 2 + %1 = load i8, ptr %arrayidx10.1, align 1 + %arrayidx14.1 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 2 + store i8 %1, ptr %arrayidx14.1, align 1 + %add.2 = add i8 %0, %1 + %arrayidx14.2 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 3 + store i8 %add.2, ptr %arrayidx14.2, align 1 + %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 1 + %2 = load i8, ptr %arrayidx10.3, align 1 + %arrayidx14.3 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 4 + store i8 %2, ptr %arrayidx14.3, align 1 + %add.4 = add i8 %0, %2 + %arrayidx14.4 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 5 + store i8 %add.4, ptr %arrayidx14.4, align 1 + %add.5 = add i8 %1, %2 + %arrayidx14.5 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 6 + store i8 %add.5, ptr %arrayidx14.5, align 1 + %add.6 = add i8 %0, %add.5 + %arrayidx14.6 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 7 + store i8 
%add.6, ptr %arrayidx14.6, align 1 + %3 = load i8, ptr %in.addr.031, align 1 + %arrayidx14.7 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 8 + store i8 %3, ptr %arrayidx14.7, align 1 + %add.8 = add i8 %0, %3 + %arrayidx14.8 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 9 + store i8 %add.8, ptr %arrayidx14.8, align 1 + %add.9 = add i8 %1, %3 + %arrayidx14.9 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 10 + store i8 %add.9, ptr %arrayidx14.9, align 1 + %add.10 = add i8 %0, %add.9 + %arrayidx14.10 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 11 + store i8 %add.10, ptr %arrayidx14.10, align 1 + %add.11 = add i8 %2, %3 + %arrayidx14.11 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 12 + store i8 %add.11, ptr %arrayidx14.11, align 1 + %add.12 = add i8 %0, %add.11 + %arrayidx14.12 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 13 + store i8 %add.12, ptr %arrayidx14.12, align 1 + %add.13 = add i8 %1, %add.11 + %arrayidx14.13 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 14 + store i8 %add.13, ptr %arrayidx14.13, align 1 + %add.14 = add i8 %0, %add.13 + %arrayidx14.14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 15 + store i8 %add.14, ptr %arrayidx14.14, align 1 + %add.ptr = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 16 + %add.ptr15 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 4 + %inc17 = add nuw nsw i32 %i.033, 1 + %exitcond.not = icmp eq i32 %inc17, 32 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0 +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll index 68cfc659e1e94..c4c79d7fc63a2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -335,50 +335,50 @@ define void @scalar_store_cost_after_discarding_interleave_group(ptr %dst, i32 % ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TEMP1:%.*]] = alloca [64 x i32], align 4 ; CHECK-NEXT: call void @init(ptr [[TEMP1]]) -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[TMP21:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i32> poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT]], <8 x i32> poison, <8 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = lshr <8 x i32> [[BROADCAST_SPLAT]], splat (i32 1) +; CHECK-NEXT: [[TMP1:%.*]] = mul <8 x i32> [[BROADCAST_SPLAT]], splat (i32 -171254) +; CHECK-NEXT: [[TMP2:%.*]] = lshr <8 x i32> [[TMP1]], splat (i32 1) +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP3]], splat (i32 1) +; CHECK-NEXT: [[TMP5:%.*]] = lshr <8 x i32> [[TMP4]], splat (i32 1) +; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i32> [[TMP5]] to <8 x i16> +; CHECK-NEXT: [[TMP7:%.*]] = sub <8 x i32> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP8:%.*]] = lshr <8 x i32> [[TMP7]], splat (i32 1) +; CHECK-NEXT: [[TMP9:%.*]] = trunc <8 x i32> [[TMP8]] to <8 x i16> +; CHECK-NEXT: [[TMP10:%.*]] = or <8 x i32> [[BROADCAST_SPLAT]], splat (i32 1) +; CHECK-NEXT: [[TMP11:%.*]] = add <8 x i32> [[TMP10]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = 
lshr <8 x i32> [[TMP11]], splat (i32 1) +; CHECK-NEXT: [[TMP13:%.*]] = trunc <8 x i32> [[TMP12]] to <8 x i16> +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP22:%.*]] = load i32, ptr [[TEMP1]], align 4 -; CHECK-NEXT: [[SHR_0:%.*]] = lshr i32 [[X]], 1 -; CHECK-NEXT: [[MUL_0:%.*]] = mul i32 [[X]], -171254 -; CHECK-NEXT: [[SHR_1:%.*]] = lshr i32 [[MUL_0]], 1 -; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[SHR_0]], [[SHR_1]] -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]] -; CHECK-NEXT: store i16 0, ptr [[TMP30]], align 2 -; CHECK-NEXT: [[GEP_0_1:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP38:%.*]] = getelementptr i8, ptr [[GEP_0_1]], i64 14 -; CHECK-NEXT: store i16 0, ptr [[TMP38]], align 2 -; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], 1 -; CHECK-NEXT: [[SHR_2:%.*]] = lshr i32 [[ADD_1]], 1 -; CHECK-NEXT: [[TMP54:%.*]] = trunc i32 [[SHR_2]] to i16 -; CHECK-NEXT: [[TMP46:%.*]] = getelementptr i8, ptr [[TMP30]], i64 2 -; CHECK-NEXT: store i16 [[TMP54]], ptr [[TMP46]], align 2 -; CHECK-NEXT: [[SUB_0:%.*]] = sub i32 0, [[MUL_0]] -; CHECK-NEXT: [[SHR_3:%.*]] = lshr i32 [[SUB_0]], 1 -; CHECK-NEXT: [[TMP70:%.*]] = trunc i32 [[SHR_3]] to i16 -; CHECK-NEXT: [[TMP62:%.*]] = getelementptr i8, ptr [[TMP30]], i64 12 -; CHECK-NEXT: store i16 [[TMP70]], ptr [[TMP62]], align 2 -; CHECK-NEXT: [[OR_0:%.*]] = or i32 [[X]], 1 -; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[OR_0]], 1 -; CHECK-NEXT: [[SHR_4:%.*]] = lshr i32 [[ADD_2]], 1 -; CHECK-NEXT: [[TMP86:%.*]] = trunc i32 [[SHR_4]] to i16 -; CHECK-NEXT: [[TMP78:%.*]] = getelementptr i8, ptr [[TMP30]], i64 4 -; CHECK-NEXT: store i16 [[TMP86]], ptr [[TMP78]], align 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP22]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i32> [[BROADCAST_SPLATINSERT1]], <8 x i32> poison, <8 x i32> zeroinitializer ; CHECK-NEXT: [[GEP_0_2:%.*]] = getelementptr i16, ptr [[DST]], i64 [[TMP21]] -; CHECK-NEXT: [[TMP94:%.*]] = getelementptr i8, ptr [[GEP_0_2]], i64 10 -; CHECK-NEXT: store i16 0, ptr [[TMP94]], align 2 -; CHECK-NEXT: [[TRUNC_3:%.*]] = trunc i32 [[TMP22]] to i16 -; CHECK-NEXT: [[OR_1:%.*]] = or i16 [[TRUNC_3]], 1 -; CHECK-NEXT: [[TMP113:%.*]] = add i16 [[OR_1]], 1 -; CHECK-NEXT: [[TMP105:%.*]] = getelementptr i8, ptr [[TMP30]], i64 8 -; CHECK-NEXT: store i16 [[TMP113]], ptr [[TMP105]], align 2 -; CHECK-NEXT: [[TMP121:%.*]] = getelementptr i8, ptr [[TMP30]], i64 6 -; CHECK-NEXT: store i16 0, ptr [[TMP121]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[TMP21]], 8 -; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[TMP21]], 128 -; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: ret void +; CHECK-NEXT: [[TMP16:%.*]] = trunc <8 x i32> [[BROADCAST_SPLAT2]] to <8 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = or <8 x i16> [[TMP16]], splat (i16 1) +; CHECK-NEXT: [[TMP18:%.*]] = add <8 x i16> [[TMP17]], splat (i16 1) +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i16> zeroinitializer, <8 x i16> [[TMP6]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i16> [[TMP13]], <8 x i16> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i16> [[TMP18]], <8 x i16> zeroinitializer, <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <8 x i16> [[TMP9]], <8 x i16> 
zeroinitializer, <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x i16> [[TMP19]], <16 x i16> [[TMP20]], <32 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP27]], <16 x i16> [[TMP28]], <32 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <32 x i16> [[TMP23]], <32 x i16> [[TMP24]], <64 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <64 x i16> [[TMP25]], <64 x i16> poison, <64 x i32> +; CHECK-NEXT: store <64 x i16> [[INTERLEAVED_VEC]], ptr [[GEP_0_2]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH:.*]] +; CHECK: [[SCALAR_PH]]: ; entry: %temp1 = alloca [64 x i32], align 4 diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll index 2dceb27165c4d..4450353e476e1 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll @@ -925,20 +925,20 @@ define void @same_op8_splat(ptr noalias noundef %a, ptr noundef %b, ptr noundef ; CHECK-SAME: ptr noalias noundef captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], ptr noundef readonly captures(none) [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC19:%.*]] = load <16 x float>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[TMP1]] -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC19]], [[TMP4]] -; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[WIDE_VEC19:%.*]] = load <32 x float>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <32 x float> [[WIDE_VEC]], [[TMP1]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <32 x float> [[WIDE_VEC19]], [[TMP4]] +; CHECK-NEXT: store <32 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 144 ; CHECK-NEXT: br i1 [[TMP25]], label %[[FOR_END11:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[FOR_END11]]:
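
A minimal sketch (illustrative only, not part of the patch) of what lowerInterleavedStoreWithShuffle emits for Factor = 8 with <4 x i32> lanes. The value names %f0..%f7, %z0..%z7 and %ptr are hypothetical; the two shuffle masks are the lower and upper halves of createInterleaveMask(4, 2):

  ; zip1/zip2 of field i with field i+4
  %z0 = shufflevector <4 x i32> %f0, <4 x i32> %f4, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %z1 = shufflevector <4 x i32> %f1, <4 x i32> %f5, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %z2 = shufflevector <4 x i32> %f2, <4 x i32> %f6, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %z3 = shufflevector <4 x i32> %f3, <4 x i32> %f7, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  %z4 = shufflevector <4 x i32> %f0, <4 x i32> %f4, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %z5 = shufflevector <4 x i32> %f1, <4 x i32> %f5, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %z6 = shufflevector <4 x i32> %f2, <4 x i32> %f6, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  %z7 = shufflevector <4 x i32> %f3, <4 x i32> %f7, <4 x i32> <i32 2, i32 6, i32 3, i32 7>
  ; two st4 calls replace the unsupported factor-8 stN
  call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %z0, <4 x i32> %z1, <4 x i32> %z2, <4 x i32> %z3, ptr %ptr)
  %ptr.hi = getelementptr i32, ptr %ptr, i32 16
  call void @llvm.aarch64.neon.st4.v4i32.p0(<4 x i32> %z4, <4 x i32> %z5, <4 x i32> %z6, <4 x i32> %z7, ptr %ptr.hi)

The first st4 writes fields f0..f7 of lanes 0 and 1 (the first 16 elements), and the second st4 writes lanes 2 and 3 at a 16-element offset, which is the zip1/zip2 plus two-st4 pattern checked in store_factor8 above.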