diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 73f2c55a71125..86956d1c64451 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3206,6 +3206,11 @@ class LLVM_ABI TargetLoweringBase {
   /// Default to be the minimum interleave factor: 2.
   virtual unsigned getMaxSupportedInterleaveFactor() const { return 2; }
 
+  /// Return true if the target can interleave data with shuffles.
+  virtual bool isProfitableToInterleaveWithGatherScatter() const {
+    return false;
+  }
+
   /// Lower an interleaved load to target specific intrinsics. Return
   /// true on success.
   ///
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index a6a9b5058ad94..c7d44c01f99f3 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -239,7 +239,8 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
 /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
 /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
 static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
-                               unsigned MaxFactor) {
+                               unsigned MaxFactor,
+                               bool InterleaveWithShuffles) {
   unsigned NumElts = SVI->getShuffleMask().size();
   if (NumElts < 4)
     return false;
@@ -250,6 +251,13 @@ static bool isReInterleaveMask(ShuffleVectorInst *SVI, unsigned &Factor,
       return true;
   }
 
+  if (InterleaveWithShuffles) {
+    for (unsigned i = 1; MaxFactor * i <= 16; i *= 2) {
+      Factor = i * MaxFactor;
+      if (SVI->isInterleave(Factor))
+        return true;
+    }
+  }
   return false;
 }
 
@@ -530,7 +538,8 @@ bool InterleavedAccessImpl::lowerInterleavedStore(
       cast<FixedVectorType>(SVI->getType())->getNumElements();
   // Check if the shufflevector is RE-interleave shuffle.
   unsigned Factor;
-  if (!isReInterleaveMask(SVI, Factor, MaxFactor))
+  if (!isReInterleaveMask(SVI, Factor, MaxFactor,
+                          TLI->isProfitableToInterleaveWithGatherScatter()))
     return false;
 
   assert(NumStoredElements % Factor == 0 &&
          "number of stored element should be a multiple of Factor");
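The InterleavedAccessPass change above only widens which shufflevector masks are accepted as an interleaved store: besides factors 2..MaxFactor, it now also probes power-of-two multiples of MaxFactor, capped at 16, when the target opts in. The following is a minimal standalone C++ sketch of that probing logic; the helper names isReInterleaveMaskFor and detectInterleaveFactor are illustrative only and stand in for SVI->isInterleave and the pass's own checks.

// Standalone sketch of the re-interleave mask recognition, including the new
// probing of factors MaxFactor * 2^k (capped at 16).
#include <cstdio>
#include <vector>

// True if Mask is the re-interleave mask for the given Factor, i.e.
// Mask[j * Factor + k] == k * LaneLen + j
// (e.g. Factor 2, LaneLen 4: <0, 4, 1, 5, 2, 6, 3, 7>).
static bool isReInterleaveMaskFor(const std::vector<int> &Mask,
                                  unsigned Factor) {
  if (Factor < 2 || Mask.size() % Factor != 0)
    return false;
  unsigned LaneLen = Mask.size() / Factor;
  for (unsigned j = 0; j < LaneLen; ++j)
    for (unsigned k = 0; k < Factor; ++k)
      if (Mask[j * Factor + k] != int(k * LaneLen + j))
        return false;
  return true;
}

// Mirrors the pass's strategy: try factors 2..MaxFactor first, then (when the
// target opts in) probe power-of-two multiples of MaxFactor, capped at 16.
static unsigned detectInterleaveFactor(const std::vector<int> &Mask,
                                       unsigned MaxFactor,
                                       bool InterleaveWithShuffles) {
  for (unsigned Factor = 2; Factor <= MaxFactor; ++Factor)
    if (isReInterleaveMaskFor(Mask, Factor))
      return Factor;
  if (InterleaveWithShuffles)
    for (unsigned i = 1; MaxFactor * i <= 16; i *= 2)
      if (isReInterleaveMaskFor(Mask, MaxFactor * i))
        return MaxFactor * i;
  return 0;
}

int main() {
  // Factor-8 interleave of 8 vectors of 4 lanes each (LaneLen = 4), the shape
  // used by the store_factor8 test below.
  std::vector<int> Mask;
  for (unsigned j = 0; j < 4; ++j)
    for (unsigned k = 0; k < 8; ++k)
      Mask.push_back(k * 4 + j);
  std::printf("detected factor: %u\n",
              detectInterleaveFactor(Mask, /*MaxFactor=*/4,
                                     /*InterleaveWithShuffles=*/true));
  return 0;
}

With MaxFactor = 4 this prints "detected factor: 8", which is the case the AArch64 hook enables.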
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 662d84b7a60a8..3f482209ccd29 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -96,6 +96,7 @@
 #include
 #include
 #include
+#include <deque>
 #include
 #include
 #include
@@ -18023,11 +18024,17 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
                                                   unsigned Factor,
                                                   const APInt &GapMask) const {
-  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
-         "Invalid interleave factor");
   auto *SI = dyn_cast<StoreInst>(Store);
   if (!SI)
     return false;
+
+  if (isProfitableToInterleaveWithGatherScatter() &&
+      Factor > getMaxSupportedInterleaveFactor())
+    return lowerInterleavedStoreWithShuffle(SI, SVI, Factor);
+
+  assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
+         "Invalid interleave factor");
+
   assert(!LaneMask && GapMask.popcount() == Factor &&
          "Unexpected mask on store");
@@ -18173,6 +18180,135 @@ bool AArch64TargetLowering::lowerInterleavedStore(Instruction *Store,
   return true;
 }
 
+/// If the interleave factor is greater than the maximum factor supported by
+/// the stN instructions (MaxSupportedFactor), the same interleaved store can
+/// still be formed by pre-interleaving the data with additional shuffles.
+/// The diagram below shows how 8 vectors are shuffled so that they can be
+/// stored with stN instructions. The data must end up in memory interleaved
+/// in the order v0,v1,v2,v3,v4,v5,v6,v7:
+///
+///     v0 v4       v2 v6       v1 v5       v3 v7
+///      \ /         \ /         \ /         \ /
+///  [zip v0,v4] [zip v2,v6] [zip v1,v5] [zip v3,v7]  ==> stN = 4
+///        \        /               \        /
+///         \      /                 \      /
+///     [zip v0,v2,v4,v6]       [zip v1,v3,v5,v7]     ==> stN = 2
+///
+/// At the stN = 4 level, the zipped registers holding the first half of the
+/// interleaved data are stored with one st4 instruction and the registers
+/// holding the second half with another st4, so two st4 are needed in total.
+///
+/// At the stN = 2 level, each pair of the final zipped registers is stored
+/// with one st2 instruction, so four st2 are needed in total.
+bool AArch64TargetLowering::lowerInterleavedStoreWithShuffle(
+    StoreInst *SI, ShuffleVectorInst *SVI, unsigned Factor) const {
+  unsigned MaxSupportedFactor = getMaxSupportedInterleaveFactor();
+
+  auto *VecTy = cast<FixedVectorType>(SVI->getType());
+  assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
+
+  unsigned LaneLen = VecTy->getNumElements() / Factor;
+  Type *EltTy = VecTy->getElementType();
+  auto *SubVecTy = FixedVectorType::get(EltTy, Factor);
+
+  const DataLayout &DL = SI->getModule()->getDataLayout();
+  bool UseScalable;
+
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() ||
+      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+    return false;
+
+  if (UseScalable)
+    return false;
+
+  std::deque<Value *> Shuffles;
+  Shuffles.push_back(SVI);
+  unsigned ConcatLevel = Factor;
+  while (ConcatLevel > 1) {
+    std::deque<Value *> ShufflesIntermediate;
+    ShufflesIntermediate = Shuffles;
+    Shuffles.clear();
+    while (!ShufflesIntermediate.empty()) {
+      ShuffleVectorInst *SFL =
+          dyn_cast<ShuffleVectorInst>(ShufflesIntermediate.front());
+      if (!SFL)
+        break;
+      ShufflesIntermediate.pop_front();
+
+      Value *Op0 = SFL->getOperand(0);
+      Value *Op1 = SFL->getOperand(1);
+
+      Shuffles.push_back(dyn_cast<ShuffleVectorInst>(Op0));
+      Shuffles.push_back(dyn_cast<ShuffleVectorInst>(Op1));
+    }
+    if (!ShufflesIntermediate.empty()) {
+      Shuffles = ShufflesIntermediate;
+      break;
+    }
+    ConcatLevel = ConcatLevel >> 1;
+  }
+
+  if (Shuffles.size() != Factor)
+    return false;
+
+  IRBuilder<> Builder(SI);
+  auto Mask = createInterleaveMask(LaneLen, 2);
+  SmallVector<int> UpperHalfMask, LowerHalfMask;
+  for (unsigned i = 0; i < (2 * LaneLen); i++) {
+    if (i < LaneLen)
+      LowerHalfMask.push_back(Mask[i]);
+    else
+      UpperHalfMask.push_back(Mask[i]);
+  }
+
+  unsigned InterleaveFactor = Factor >> 1;
+  while (InterleaveFactor >= MaxSupportedFactor) {
+    std::deque<Value *> ShufflesIntermediate;
+    for (unsigned j = 0; j < Factor; j += (InterleaveFactor * 2)) {
+      for (unsigned i = 0; i < InterleaveFactor; i++) {
+        auto *Shuffle = Builder.CreateShuffleVector(
+            Shuffles[i + j], Shuffles[i + j + InterleaveFactor], LowerHalfMask);
+        ShufflesIntermediate.push_back(Shuffle);
+      }
+      for (unsigned i = 0; i < InterleaveFactor; i++) {
+        auto *Shuffle = Builder.CreateShuffleVector(
+            Shuffles[i + j], Shuffles[i + j + InterleaveFactor], UpperHalfMask);
+        ShufflesIntermediate.push_back(Shuffle);
+      }
+    }
+
+    Shuffles = ShufflesIntermediate;
+    InterleaveFactor = InterleaveFactor >> 1;
+  }
+
+  Type *PtrTy = SI->getPointerOperandType();
+  auto *STVTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
+
+  Value *BaseAddr = SI->getPointerOperand();
+  Function *StNFunc = getStructuredStoreFunction(
+      SI->getModule(), MaxSupportedFactor, UseScalable, STVTy, PtrTy);
+  for (unsigned i = 0; i < (Factor / MaxSupportedFactor); i++) {
+    SmallVector<Value *> Ops;
+    for (unsigned j = 0; j < MaxSupportedFactor; j++)
+      Ops.push_back(Shuffles[i * MaxSupportedFactor + j]);
+
+    if (i > 0) {
+      // We will compute the pointer operand of each store from the original
+      // base address using GEPs. Cast the base address to a pointer to the
+      // scalar element type.
+      BaseAddr = Builder.CreateConstGEP1_32(
+          SubVecTy->getElementType(), BaseAddr, LaneLen * MaxSupportedFactor);
+    }
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+    Builder.CreateCall(StNFunc, Ops);
+  }
+  return true;
+}
+
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Instruction *Load, Value *Mask, IntrinsicInst *DI) const {
   const unsigned Factor = getDeinterleaveIntrinsicFactor(DI->getIntrinsicID());
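To make the data flow of lowerInterleavedStoreWithShuffle concrete, here is a small standalone C++ model (plain arrays rather than LLVM IR) of the Factor = 8, MaxSupportedFactor = 4, LaneLen = 4 case: one zip1/zip2 stage followed by two st4 stores. The zip1/zip2/st4 helpers are illustrative stand-ins for the shuffles and the intrinsic built by the patch; vector and lane sizes are taken from the store_factor8 test.

// Model of the factor-8 lowering: zip pairs (v_i, v_{i+4}), then two st4.
#include <array>
#include <cassert>
#include <cstdio>
#include <vector>

using Vec4 = std::array<int, 4>;

// zip1/zip2 of two 4-lane vectors: the shuffles built from the lower and
// upper halves of createInterleaveMask(LaneLen, 2).
static Vec4 zip1(const Vec4 &A, const Vec4 &B) {
  return {A[0], B[0], A[1], B[1]};
}
static Vec4 zip2(const Vec4 &A, const Vec4 &B) {
  return {A[2], B[2], A[3], B[3]};
}

// st4 writes its four register operands element-interleaved to memory.
static void st4(std::vector<int> &Mem, const std::array<Vec4, 4> &Ops) {
  for (int Lane = 0; Lane < 4; ++Lane)
    for (int R = 0; R < 4; ++R)
      Mem.push_back(Ops[R][Lane]);
}

int main() {
  // Eight source vectors; element value encodes (vector, lane) as V * 10 + L.
  std::array<Vec4, 8> V;
  for (int I = 0; I < 8; ++I)
    for (int L = 0; L < 4; ++L)
      V[I][L] = I * 10 + L;

  // Stage with InterleaveFactor = 4: zip V[i] with V[i + 4].
  std::array<Vec4, 8> Z;
  for (int I = 0; I < 4; ++I) {
    Z[I] = zip1(V[I], V[I + 4]);     // lower-half shuffles
    Z[I + 4] = zip2(V[I], V[I + 4]); // upper-half shuffles
  }

  // Factor / MaxSupportedFactor = 2 st4 stores.
  std::vector<int> Mem;
  st4(Mem, {Z[0], Z[1], Z[2], Z[3]});
  st4(Mem, {Z[4], Z[5], Z[6], Z[7]});

  // Expected layout: the full factor-8 interleave of the original vectors.
  std::vector<int> Expected;
  for (int L = 0; L < 4; ++L)
    for (int I = 0; I < 8; ++I)
      Expected.push_back(V[I][L]);

  assert(Mem == Expected);
  std::printf("factor-8 store lowered to 2 x st4 matches the interleave\n");
  return 0;
}

Running this confirms that eight zip1/zip2 shuffles plus two st4 instructions reproduce the factor-8 interleaved layout, which is exactly the instruction sequence the store_factor8 CHECK lines below expect.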
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 9495c9ffc47aa..867e01664eaae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -229,6 +229,10 @@ class AArch64TargetLowering : public TargetLowering {
 
   bool hasPairedLoad(EVT LoadedType, Align &RequiredAlignment) const override;
 
+  bool isProfitableToInterleaveWithGatherScatter() const override {
+    return true;
+  }
+
   unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
 
   bool lowerInterleavedLoad(Instruction *Load, Value *Mask,
@@ -239,6 +243,9 @@ class AArch64TargetLowering : public TargetLowering {
                             ShuffleVectorInst *SVI, unsigned Factor,
                             const APInt &GapMask) const override;
 
+  bool lowerInterleavedStoreWithShuffle(StoreInst *SI, ShuffleVectorInst *SVI,
+                                        unsigned Factor) const;
+
   bool lowerDeinterleaveIntrinsicToLoad(Instruction *Load, Value *Mask,
                                         IntrinsicInst *DI) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 479e34515fc8a..f5a8610bf3548 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4801,11 +4801,36 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
     return InstructionCost::getInvalid();
 
-  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  unsigned NumLoadStores = 1;
+  InstructionCost ShuffleCost = 0;
+  bool isInterleaveWithShuffle = false;
+  unsigned MaxSupportedFactor = TLI->getMaxSupportedInterleaveFactor();
+
+  auto *SubVecTy =
+      VectorType::get(VecVTy->getElementType(),
+                      VecVTy->getElementCount().divideCoefficientBy(Factor));
+
+  if (TLI->isProfitableToInterleaveWithGatherScatter() &&
+      Opcode == Instruction::Store && (0 == Factor % MaxSupportedFactor) &&
+      Factor > MaxSupportedFactor) {
+    isInterleaveWithShuffle = true;
+    SmallVector<int> Mask;
+    // Prepare the interleave mask.
+    for (unsigned i = 0; i < VecVTy->getElementCount().getKnownMinValue() / 2;
+         i++) {
+      for (unsigned j = 0; j < 2; j++)
+        Mask.push_back(j * Factor + i);
+    }
+
+    NumLoadStores = Factor / MaxSupportedFactor;
+    ShuffleCost =
+        (Factor * getShuffleCost(TargetTransformInfo::SK_Splice, VecVTy, VecVTy,
+                                 Mask, CostKind, 0, SubVecTy));
+  }
+
+  if (!UseMaskForGaps &&
+      (Factor <= MaxSupportedFactor || isInterleaveWithShuffle)) {
     unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
-    auto *SubVecTy =
-        VectorType::get(VecVTy->getElementType(),
-                        VecVTy->getElementCount().divideCoefficientBy(Factor));
 
     // ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be @@ -4813,7 +4838,10 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost( bool UseScalable; if (MinElts % Factor == 0 && TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable)) - return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable); + return (Factor * + TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable) * + NumLoadStores) + + ShuffleCost; } return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, diff --git a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll index 3685e9cf85bd6..6d0a0300e0a91 100644 --- a/llvm/test/CodeGen/AArch64/vldn_shuffle.ll +++ b/llvm/test/CodeGen/AArch64/vldn_shuffle.ll @@ -730,6 +730,109 @@ entry: ret void } +define void @store_factor8(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, + <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7) { +; CHECK-LABEL: store_factor8: +; CHECK: .Lfunc_begin17: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK: zip1 [[V1:.*s]], [[I1:.*s]], [[I5:.*s]] +; CHECK-NEXT: zip2 [[V5:.*s]], [[I1]], [[I5]] +; CHECK-NEXT: zip1 [[V2:.*s]], [[I2:.*s]], [[I6:.*s]] +; CHECK-NEXT: zip2 [[V6:.*s]], [[I2]], [[I6]] +; CHECK-NEXT: zip1 [[V3:.*s]], [[I3:.*s]], [[I7:.*s]] +; CHECK-NEXT: zip2 [[V7:.*s]], [[I3]], [[I7]] +; CHECK-NEXT: zip1 [[V4:.*s]], [[I4:.*s]], [[I8:.*s]] +; CHECK-NEXT: zip2 [[V8:.*s]], [[I4]], [[I8]] +; CHECK-NEXT: st4 { [[V1]], [[V2]], [[V3]], [[V4]] }, [x0], #64 +; CHECK-NEXT: st4 { [[V5]], [[V6]], [[V7]], [[V8]] }, [x0] +; CHECK-NEXT: ret + + %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> + %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> + %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> + %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> + + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> + + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> + store <32 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + +define void @store_factor16(ptr %ptr, <4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2, <4 x i32> %a3, + <4 x i32> %a4, <4 x i32> %a5, <4 x i32> %a6, <4 x i32> %a7, + <4 x i32> %a8, <4 x i32> %a9, <4 x i32> %a10, <4 x i32> %a11, + <4 x i32> %a12, <4 x i32> %a13, <4 x i32> %a14, <4 x i32> %a15) { +; CHECK-LABEL: store_factor16: +; CHECK: .Lfunc_begin18: +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK: zip1 [[V05:.*s]], [[I05:.*s]], [[I13:.*s]] +; CHECK-NEXT: zip1 [[V01:.*s]], [[I01:.*s]], [[I09:.*s]] +; CHECK-NEXT: zip1 [[V02:.*s]], [[I02:.*s]], [[I10:.*s]] +; CHECK-NEXT: zip1 [[V06:.*s]], [[I06:.*s]], [[I14:.*s]] +; CHECK-NEXT: zip1 [[V07:.*s]], [[I07:.*s]], [[I15:.*s]] +; CHECK-NEXT: zip1 [[V08:.*s]], [[I08:.*s]], [[I16:.*s]] +; CHECK-NEXT: zip2 [[V09:.*s]], [[I01]], [[I09]] +; CHECK-NEXT: zip1 [[V03:.*s]], [[I03:.*s]], [[I11:.*s]] +; CHECK-NEXT: zip1 [[V04:.*s]], [[I04:.*s]], [[I12:.*s]] +; CHECK-NEXT: zip2 [[V11:.*s]], [[I03]], [[I11]] +; CHECK-NEXT: zip2 [[V12:.*s]], [[I04]], [[I12]] +; CHECK-NEXT: zip2 [[V13:.*s]], [[I05]], [[I13]] +; CHECK-NEXT: zip2 [[V10:.*s]], [[I02]], [[I10]] +; CHECK-NEXT: zip1 [[V17:.*s]], [[V01]], [[V05]] +; CHECK-NEXT: zip2 [[V21:.*s]], [[V01]], [[V05]] +; CHECK-NEXT: zip2 [[V14:.*s]], [[I06]], [[I14]] +; CHECK-NEXT: zip1 [[V18:.*s]], [[V02]], [[V06]] +; CHECK-NEXT: zip2 [[V22:.*s]], [[V02]], [[V06]] +; CHECK-NEXT: zip2 [[V15:.*s]], 
[[I07]], [[I15]] +; CHECK-NEXT: zip1 [[V19:.*s]], [[V03]], [[V07]] +; CHECK-NEXT: zip2 [[V23:.*s]], [[V03]], [[V07]] +; CHECK-NEXT: zip2 [[V16:.*s]], [[I08]], [[I16]] +; CHECK-NEXT: zip1 [[V20:.*s]], [[V04]], [[V08]] +; CHECK-NEXT: zip2 [[V24:.*s]], [[V04]], [[V08]] +; CHECK-NEXT: zip1 [[V25:.*s]], [[V09]], [[V13]] +; CHECK-NEXT: zip1 [[V26:.*s]], [[V10]], [[V14]] +; CHECK-NEXT: zip1 [[V27:.*s]], [[V11]], [[V15]] +; CHECK-NEXT: zip1 [[V28:.*s]], [[V12]], [[V16]] +; CHECK-NEXT: st4 { [[V17]], [[V18]], [[V19]], [[V20]] }, [x8], #64 +; CHECK-NEXT: ldp d9, d8, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: st4 { [[V21]], [[V22]], [[V23]], [[V24]] }, [x8] +; CHECK-NEXT: zip2 [[V29:.*s]], [[V09]], [[V13]] +; CHECK-NEXT: add x8, x0, #128 +; CHECK-NEXT: zip2 [[V30:.*s]], [[V10]], [[V14]] +; CHECK-NEXT: zip2 [[V31:.*s]], [[V11]], [[V15]] +; CHECK-NEXT: zip2 [[V32:.*s]], [[V12]], [[V16]] +; CHECK-NEXT: st4 { [[V25]], [[V26]], [[V27]], [[V28]] }, [x8] +; CHECK-NEXT: add x8, x0, #192 +; CHECK-NEXT: st4 { [[V29]], [[V30]], [[V31]], [[V32]] }, [x8] +; CHECK-NEXT: ldp d11, d10, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + + %v0 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> + %v1 = shufflevector <4 x i32> %a2, <4 x i32> %a3, <8 x i32> + %v2 = shufflevector <4 x i32> %a4, <4 x i32> %a5, <8 x i32> + %v3 = shufflevector <4 x i32> %a6, <4 x i32> %a7, <8 x i32> + %v4 = shufflevector <4 x i32> %a8, <4 x i32> %a9, <8 x i32> + %v5 = shufflevector <4 x i32> %a10, <4 x i32> %a11, <8 x i32> + %v6 = shufflevector <4 x i32> %a12, <4 x i32> %a13, <8 x i32> + %v7 = shufflevector <4 x i32> %a14, <4 x i32> %a15, <8 x i32> + + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> + %s2 = shufflevector <8 x i32> %v4, <8 x i32> %v5, <16 x i32> + %s3 = shufflevector <8 x i32> %v6, <8 x i32> %v7, <16 x i32> + + %d0 = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> + %d1 = shufflevector <16 x i32> %s2, <16 x i32> %s3, <32 x i32> + + %interleaved.vec = shufflevector <32 x i32> %d0, <32 x i32> %d1, <64 x i32> + store <64 x i32> %interleaved.vec, ptr %ptr, align 4 + ret void +} + declare void @llvm.dbg.value(metadata, metadata, metadata) !llvm.dbg.cu = !{!0} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll new file mode 100644 index 0000000000000..bd5f4e2a3279b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaved_store.ll @@ -0,0 +1,117 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=loop-vectorize -enable-interleaved-mem-accesses=true -max-interleave-group-factor=16 -S < %s | FileCheck %s + +define dso_local void @_Z6unpackPhS_(ptr noalias noundef readonly captures(none) %in, ptr noalias noundef writeonly captures(none) %out) { +; CHECK-LABEL: define dso_local void @_Z6unpackPhS_( +; CHECK-SAME: ptr noalias noundef readonly captures(none) [[IN:%.*]], ptr noalias noundef writeonly captures(none) [[OUT:%.*]]) { +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[OUT]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 4 +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[IN]], i64 [[OFFSET_IDX2]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load 
<16 x i8>, ptr [[NEXT_GEP3]], align 1, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i8> [[WIDE_VEC]], <16 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC5]] +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC4]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i8> [[STRIDED_VEC4]], [[STRIDED_VEC]] +; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i8> [[STRIDED_VEC5]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = add <4 x i8> [[STRIDED_VEC6]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> zeroinitializer, <4 x i8> [[STRIDED_VEC6]], <8 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC5]], <4 x i8> [[TMP0]], <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC4]], <4 x i8> [[TMP1]], <8 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[STRIDED_VEC]], <4 x i8> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP5]], <4 x i8> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> [[TMP8]], <8 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> [[TMP10]], <8 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP11]], <8 x i8> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i8> [[TMP13]], <8 x i8> [[TMP14]], <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <8 x i8> [[TMP15]], <8 x i8> [[TMP16]], <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <8 x i8> [[TMP17]], <8 x i8> [[TMP18]], <16 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <16 x i8> [[TMP19]], <16 x i8> [[TMP20]], <32 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[TMP21]], <16 x i8> [[TMP22]], <32 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <32 x i8> [[TMP23]], <32 x i8> [[TMP24]], <64 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <64 x i8> [[TMP25]], <64 x i8> poison, <64 x i32> +; CHECK-NEXT: store <64 x i8> [[INTERLEAVED_VEC]], ptr [[NEXT_GEP]], align 1, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32 +; CHECK-NEXT: br i1 [[TMP26]], label %[[MIDDLE_BLOCK:.*]], label %vector.body, !llvm.loop [[LOOP5:![0-9]+]] +; +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret void + +for.body: ; preds = %entry, %for.body + %i.033 = phi i32 [ 0, %entry ], [ %inc17, %for.body ] + %out.addr.032 = phi ptr [ %out, %entry ], [ %add.ptr, %for.body ] + %in.addr.031 = phi ptr [ %in, %entry ], [ %add.ptr15, %for.body ] + store i8 
0, ptr %out.addr.032, align 1 + %arrayidx10 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 3 + %0 = load i8, ptr %arrayidx10, align 1 + %arrayidx14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 1 + store i8 %0, ptr %arrayidx14, align 1 + %arrayidx10.1 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 2 + %1 = load i8, ptr %arrayidx10.1, align 1 + %arrayidx14.1 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 2 + store i8 %1, ptr %arrayidx14.1, align 1 + %add.2 = add i8 %0, %1 + %arrayidx14.2 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 3 + store i8 %add.2, ptr %arrayidx14.2, align 1 + %arrayidx10.3 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 1 + %2 = load i8, ptr %arrayidx10.3, align 1 + %arrayidx14.3 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 4 + store i8 %2, ptr %arrayidx14.3, align 1 + %add.4 = add i8 %0, %2 + %arrayidx14.4 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 5 + store i8 %add.4, ptr %arrayidx14.4, align 1 + %add.5 = add i8 %1, %2 + %arrayidx14.5 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 6 + store i8 %add.5, ptr %arrayidx14.5, align 1 + %add.6 = add i8 %0, %add.5 + %arrayidx14.6 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 7 + store i8 %add.6, ptr %arrayidx14.6, align 1 + %3 = load i8, ptr %in.addr.031, align 1 + %arrayidx14.7 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 8 + store i8 %3, ptr %arrayidx14.7, align 1 + %add.8 = add i8 %0, %3 + %arrayidx14.8 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 9 + store i8 %add.8, ptr %arrayidx14.8, align 1 + %add.9 = add i8 %1, %3 + %arrayidx14.9 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 10 + store i8 %add.9, ptr %arrayidx14.9, align 1 + %add.10 = add i8 %0, %add.9 + %arrayidx14.10 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 11 + store i8 %add.10, ptr %arrayidx14.10, align 1 + %add.11 = add i8 %2, %3 + %arrayidx14.11 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 12 + store i8 %add.11, ptr %arrayidx14.11, align 1 + %add.12 = add i8 %0, %add.11 + %arrayidx14.12 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 13 + store i8 %add.12, ptr %arrayidx14.12, align 1 + %add.13 = add i8 %1, %add.11 + %arrayidx14.13 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 14 + store i8 %add.13, ptr %arrayidx14.13, align 1 + %add.14 = add i8 %0, %add.13 + %arrayidx14.14 = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 15 + store i8 %add.14, ptr %arrayidx14.14, align 1 + %add.ptr = getelementptr inbounds nuw i8, ptr %out.addr.032, i64 16 + %add.ptr15 = getelementptr inbounds nuw i8, ptr %in.addr.031, i64 4 + %inc17 = add nuw nsw i32 %i.033, 1 + %exitcond.not = icmp eq i32 %inc17, 32 + br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !0 +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll index aff2c4cb644eb..c5da14743e3c8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/replicating-load-store-costs.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "scalar.ph:" --version 6 -; RUN: opt -p loop-vectorize -S %s | FileCheck %s +; RUN: opt -p loop-vectorize -max-interleave-group-factor=4 -S %s | FileCheck %s 
target triple = "arm64-apple-macosx15.0.0" diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll index 2dceb27165c4d..4450353e476e1 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/interleave_vec.ll @@ -925,20 +925,20 @@ define void @same_op8_splat(ptr noalias noundef %a, ptr noundef %b, ptr noundef ; CHECK-SAME: ptr noalias noundef captures(none) [[A:%.*]], ptr noundef readonly captures(none) [[B:%.*]], ptr noundef readonly captures(none) [[C:%.*]]) local_unnamed_addr #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[C]], align 4 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x float> poison, float [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[BROADCAST_SPLATINSERT]], <2 x float> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT]], <4 x float> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] ; CHECK: [[VECTOR_BODY]]: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw float, ptr [[B]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x float>, ptr [[TMP5]], align 4 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <32 x float>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw float, ptr [[A]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC19:%.*]] = load <16 x float>, ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <16 x float> [[WIDE_VEC]], [[TMP1]] -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <16 x float> [[WIDE_VEC19]], [[TMP4]] -; CHECK-NEXT: store <16 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; CHECK-NEXT: [[WIDE_VEC19:%.*]] = load <32 x float>, ptr [[TMP6]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = fmul fast <32 x float> [[WIDE_VEC]], [[TMP1]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = fadd fast <32 x float> [[WIDE_VEC19]], [[TMP4]] +; CHECK-NEXT: store <32 x float> [[INTERLEAVED_VEC]], ptr [[TMP6]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], 144 ; CHECK-NEXT: br i1 [[TMP25]], label %[[FOR_END11:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: [[FOR_END11]]:
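To round off the cost-model change in getInterleavedMemoryOpCost above, here is a worked instance of the new formula for a store of <32 x i32> with interleave factor 8: cost = Factor * NumInterleavedAccesses(SubVecTy) * NumLoadStores + ShuffleCost, where NumLoadStores = Factor / MaxSupportedFactor and ShuffleCost = Factor * (cost of one splice-style shuffle). The per-shuffle cost used below is an assumed placeholder of 1, not the value the AArch64 cost tables actually return for SK_Splice.

// Worked example of the shuffle-based interleaved-store cost formula.
#include <cstdio>

int main() {
  const unsigned Factor = 8;                 // interleave factor of the group
  const unsigned MaxSupportedFactor = 4;     // st4 is the widest stN on AArch64
  const unsigned NumInterleavedAccesses = 1; // <4 x i32> sub-vector is 128 bits
  const unsigned PerShuffleCost = 1;         // assumed SK_Splice cost per shuffle

  const unsigned NumLoadStores = Factor / MaxSupportedFactor; // 2 x st4
  const unsigned ShuffleCost = Factor * PerShuffleCost;       // 8 zip1/zip2
  const unsigned Total =
      Factor * NumInterleavedAccesses * NumLoadStores + ShuffleCost;

  // Prints: stores=2 shuffles=8 total=24
  std::printf("store of <32 x i32>, factor 8: stores=%u shuffles=%u total=%u\n",
              NumLoadStores, ShuffleCost, Total);
  return 0;
}

Under these assumptions the shuffle-based path is costed as a finite value instead of falling back to scalarization, which is what lets the loop vectorizer form the factor-8 and factor-16 interleave groups exercised by the tests above.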