diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index e0ade02959025..e233d430e98dd 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -59,6 +59,8 @@
 #include <string>
 #include <utility>
 #include <vector>
+#include <queue>
+#include <utility>
 
 namespace llvm {
 
@@ -3145,6 +3147,7 @@ class TargetLoweringBase {
   /// \p DI is the deinterleave intrinsic.
   /// \p LI is the accompanying load instruction
   virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                                std::queue<std::pair<unsigned, Value *>> &LeafNodes,
                                                 LoadInst *LI) const {
     return false;
   }
@@ -3156,6 +3159,7 @@ class TargetLoweringBase {
   /// \p II is the interleave intrinsic.
   /// \p SI is the accompanying store instruction
   virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                               std::queue<Value *> &LeafNodes,
                                                StoreInst *SI) const {
     return false;
   }
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 438ac1c3cc6e2..73c3a63b61da3 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -71,6 +71,7 @@
 #include "llvm/Transforms/Utils/Local.h"
 #include <cassert>
 #include <utility>
+#include <queue>
 
 using namespace llvm;
 
@@ -510,12 +511,52 @@ bool InterleavedAccessImpl::lowerDeinterleaveIntrinsic(
 
   LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
 
+  // Walk the tree of deinterleave2 intrinsics rooted at DI and collect the
+  // leaf extractvalue instructions; their results are what the target will
+  // rewrite to use a single wide (de)interleaving load.
+  std::stack<IntrinsicInst *> DeinterleaveTreeQueue;
+  std::queue<std::pair<unsigned, Value *>> LeafNodes;
+  std::map<IntrinsicInst *, bool> Visited;
+  SmallVector<Instruction *> TempDeadInsts;
+
+  DeinterleaveTreeQueue.push(DI);
+  unsigned DILeafCount = 0;
+  while (!DeinterleaveTreeQueue.empty()) {
+    auto *CurrentDI = DeinterleaveTreeQueue.top();
+    DeinterleaveTreeQueue.pop();
+    TempDeadInsts.push_back(CurrentDI);
+    bool LeafFound = false;
+    // Iterate over the extractvalue users of this deinterleave node.
+    for (auto *UserExtract : CurrentDI->users()) {
+      Instruction *Extract = dyn_cast<Instruction>(UserExtract);
+      if (!Extract || Extract->getOpcode() != Instruction::ExtractValue)
+        continue;
+      bool IsLeaf = true;
+      // Iterate over the deinterleave users of this extractvalue.
+      for (auto *UserDI : UserExtract->users()) {
+        IntrinsicInst *ChildDI = dyn_cast<IntrinsicInst>(UserDI);
+        if (!ChildDI || ChildDI->getIntrinsicID() !=
+                            Intrinsic::experimental_vector_deinterleave2)
+          continue;
+        IsLeaf = false;
+        if (Visited.count(ChildDI) == 0) {
+          Visited[ChildDI] = true;
+          DeinterleaveTreeQueue.push(ChildDI);
+        }
+      }
+      if (IsLeaf) {
+        LeafFound = true;
+        LeafNodes.push(std::make_pair(DILeafCount, UserExtract));
+      }
+      TempDeadInsts.push_back(Extract);
+    }
+    if (LeafFound)
+      DILeafCount += CurrentDI->getNumUses();
+  }
   // Try and match this with target specific intrinsics.
-  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LeafNodes, LI))
     return false;
 
   // We now have a target-specific load, so delete the old one.
-  DeadInsts.push_back(DI);
+  DeadInsts.insert(DeadInsts.end(), TempDeadInsts.rbegin(), TempDeadInsts.rend());
   DeadInsts.push_back(LI);
 
   return true;
 }
@@ -531,14 +572,33 @@ bool InterleavedAccessImpl::lowerInterleaveIntrinsic(
     return false;
 
   LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
-
+  // Walk the tree of interleave2 intrinsics feeding II and collect the leaf
+  // operands; they become the operands of the wide interleaving store.
+  std::queue<IntrinsicInst *> InterleaveTreeQueue;
+  std::queue<Value *> LeafNodes;
+  SmallVector<Instruction *> TempDeadInsts;
+
+  InterleaveTreeQueue.push(II);
+  while (!InterleaveTreeQueue.empty()) {
+    auto *Node = InterleaveTreeQueue.front();
+    TempDeadInsts.push_back(Node);
+    InterleaveTreeQueue.pop();
+    for (unsigned I = 0; I < 2; ++I) {
+      auto *Op = Node->getOperand(I);
+      if (auto *CurrentII = dyn_cast<IntrinsicInst>(Op)) {
+        if (CurrentII->getIntrinsicID() ==
+            Intrinsic::experimental_vector_interleave2) {
+          InterleaveTreeQueue.push(CurrentII);
+          continue;
+        }
+      }
+      LeafNodes.push(Op);
+    }
+  }
   // Try and match this with target specific intrinsics.
-  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+  if (!TLI->lowerInterleaveIntrinsicToStore(II, LeafNodes, SI))
     return false;
 
   // We now have a target-specific store, so delete the old one.
   DeadInsts.push_back(SI);
-  DeadInsts.push_back(II);
+  DeadInsts.insert(DeadInsts.end(), TempDeadInsts.begin(), TempDeadInsts.end());
 
   return true;
 }
@@ -559,7 +619,7 @@ bool InterleavedAccessImpl::runOnFunction(Function &F) {
         // with a factor of 2.
         if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
           Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
-        if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+        else if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
           Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
       }
     }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7947d73f9a4dd..ab8c01e2df5a9 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16345,15 +16345,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
 }
 
 bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
-    IntrinsicInst *DI, LoadInst *LI) const {
+    IntrinsicInst *DI, std::queue<std::pair<unsigned, Value *>> &LeafNodes,
+    LoadInst *LI) const {
   // Only deinterleave2 supported at present.
   if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
     return false;
 
-  // Only a factor of 2 supported at present.
-  const unsigned Factor = 2;
+  const unsigned Factor = std::max(2, (int)LeafNodes.size());
 
-  VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+  VectorType *VTy = (LeafNodes.size() > 0)
+                        ? cast<VectorType>(LeafNodes.front().second->getType())
+                        : cast<VectorType>(DI->getType()->getContainedType(0));
 
   const DataLayout &DL = DI->getModule()->getDataLayout();
   bool UseScalable;
   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
     return false;
@@ -16409,8 +16409,27 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
     Result = Builder.CreateInsertValue(Result, Left, 0);
     Result = Builder.CreateInsertValue(Result, Right, 1);
   } else {
-    if (UseScalable)
+    if (UseScalable) {
       Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+      if (Factor == 2) {
+        DI->replaceAllUsesWith(Result);
+        return true;
+      }
+      // For larger factors, remap every leaf extractvalue of the
+      // deinterleave tree onto the corresponding result of the single ldN.
+      while (!LeafNodes.empty()) {
+        unsigned ExtractIndex = LeafNodes.front().first;
+        Value *CurrentExtract = LeafNodes.front().second;
+        LeafNodes.pop();
+        auto *EVI = cast<ExtractValueInst>(CurrentExtract);
+
+        SmallVector<unsigned, 4> NewIndices;
+        for (unsigned Index : EVI->indices())
+          NewIndices.push_back(Index + ExtractIndex);
+
+        Value *NewExtract = Builder.CreateExtractValue(Result, NewIndices);
+        CurrentExtract->replaceAllUsesWith(NewExtract);
+      }
+      return true;
-    else
+    } else
       Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
   }
@@ -16420,15 +16439,15 @@ bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
 }
 
 bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
-    IntrinsicInst *II, StoreInst *SI) const {
+    IntrinsicInst *II, std::queue<Value *> &LeafNodes, StoreInst *SI) const {
   // Only interleave2 supported at present.
   if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
     return false;
 
-  // Only a factor of 2 supported at present.
-  const unsigned Factor = 2;
+  // The leaf nodes are the operands that will be interleaved by the store.
+  const unsigned Factor = LeafNodes.size();
 
-  VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+  VectorType *VTy = cast<VectorType>(LeafNodes.front()->getType());
   const DataLayout &DL = II->getModule()->getDataLayout();
   bool UseScalable;
   if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
     return false;
@@ -16473,8 +16492,16 @@ bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
       R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
     }
 
-    if (UseScalable)
-      Builder.CreateCall(StNFunc, {L, R, Pred, Address});
-    else
+    if (UseScalable) {
+      SmallVector<Value *> Args;
+      while (!LeafNodes.empty()) {
+        Args.push_back(LeafNodes.front());
+        LeafNodes.pop();
+      }
+      Args.push_back(Pred);
+      Args.push_back(Address);
+      Builder.CreateCall(StNFunc, Args);
+    } else
       Builder.CreateCall(StNFunc, {L, R, Address});
   }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index db6e8a00d2fb5..85497a1f7ae41 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -683,9 +683,11 @@ class AArch64TargetLowering : public TargetLowering {
                              unsigned Factor) const override;
 
   bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                        std::queue<std::pair<unsigned, Value *>> &LeafNodes,
                                         LoadInst *LI) const override;
 
   bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                       std::queue<Value *> &LeafNodes,
                                        StoreInst *SI) const override;
 
   bool isLegalAddImmediate(int64_t) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index e80931a03f30b..35150928f0adb 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3315,15 +3315,17 @@ InstructionCost AArch64TTIImpl::getInterleavedMemoryOpCost(
   assert(Factor >= 2 && "Invalid interleave factor");
   auto *VecVTy = cast<VectorType>(VecTy);
 
-  if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor != 2))
-    return InstructionCost::getInvalid();
+  unsigned MaxFactor = TLI->getMaxSupportedInterleaveFactor();
+  if (VecTy->isScalableTy() && (!ST->hasSVE() || Factor > MaxFactor))
+    return InstructionCost::getInvalid();
 
   // Vectorization for masked interleaved accesses is only enabled for scalable
   // VF.
   if (!VecTy->isScalableTy() && (UseMaskForCond || UseMaskForGaps))
     return InstructionCost::getInvalid();
 
-  if (!UseMaskForGaps && Factor <= TLI->getMaxSupportedInterleaveFactor()) {
+  if (!UseMaskForGaps && Factor <= MaxFactor) {
     unsigned MinElts = VecVTy->getElementCount().getKnownMinValue();
     auto *SubVecTy =
         VectorType::get(VecVTy->getElementType(),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index dc7c6f83b9857..64e0a2bb1f294 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -21025,6 +21025,7 @@ bool RISCVTargetLowering::lowerInterleavedStore(StoreInst *SI,
 }
 
 bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                                           std::queue<std::pair<unsigned, Value *>> &LeafNodes,
                                                            LoadInst *LI) const {
   assert(LI->isSimple());
   IRBuilder<> Builder(LI);
@@ -21033,10 +21034,11 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
   if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
     return false;
 
-  unsigned Factor = 2;
+  unsigned Factor = std::max(2, (int)LeafNodes.size());
 
   VectorType *VTy = cast<VectorType>(DI->getOperand(0)->getType());
-  VectorType *ResVTy = cast<VectorType>(DI->getType()->getContainedType(0));
+  VectorType *ResVTy = (LeafNodes.size() > 0)
+                           ? cast<VectorType>(LeafNodes.front().second->getType())
+                           : cast<VectorType>(DI->getType()->getContainedType(0));
 
   if (!isLegalInterleavedAccessType(ResVTy, Factor, LI->getAlign(),
                                     LI->getPointerAddressSpace(),
@@ -21064,6 +21066,27 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
                                            {ResVTy, XLenTy});
     VL = Constant::getAllOnesValue(XLenTy);
     Ops.append(Factor, PoisonValue::get(ResVTy));
+    Ops.append({LI->getPointerOperand(), VL});
+    Value *Vlseg = Builder.CreateCall(VlsegNFunc, Ops);
+    if (Factor == 2) {
+      DI->replaceAllUsesWith(Vlseg);
+      return true;
+    }
+    // For larger factors, remap every leaf extractvalue of the deinterleave
+    // tree onto the corresponding result of the single vlsegN call.
+    while (!LeafNodes.empty()) {
+      unsigned ExtractIndex = LeafNodes.front().first;
+      auto *CurrentExtract = LeafNodes.front().second;
+      LeafNodes.pop();
+      auto *EVI = cast<ExtractValueInst>(CurrentExtract);
+      SmallVector<unsigned, 4> NewIndices;
+      for (unsigned Index : EVI->indices())
+        NewIndices.push_back(Index + ExtractIndex);
+      Value *NewExtract = Builder.CreateExtractValue(Vlseg, NewIndices);
+      CurrentExtract->replaceAllUsesWith(NewExtract);
+    }
+    return true;
   }
 
   Ops.append({LI->getPointerOperand(), VL});
@@ -21075,6 +21098,7 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
 }
 
 bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                                          std::queue<Value *> &LeafNodes,
                                                           StoreInst *SI) const {
   assert(SI->isSimple());
   IRBuilder<> Builder(SI);
@@ -21083,10 +21107,10 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
   if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
     return false;
 
-  unsigned Factor = 2;
+  unsigned Factor = LeafNodes.size();
 
   VectorType *VTy = cast<VectorType>(II->getType());
-  VectorType *InVTy = cast<VectorType>(II->getOperand(0)->getType());
+  VectorType *InVTy = cast<VectorType>(LeafNodes.front()->getType());
 
   if (!isLegalInterleavedAccessType(InVTy, Factor, SI->getAlign(),
                                     SI->getPointerAddressSpace(),
@@ -21112,6 +21136,15 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
     VssegNFunc = Intrinsic::getDeclaration(SI->getModule(), IntrIds[Factor - 2],
                                            {InVTy, XLenTy});
     VL = Constant::getAllOnesValue(XLenTy);
+    SmallVector<Value *> Args;
+    while (!LeafNodes.empty()) {
+      Args.push_back(LeafNodes.front());
+      LeafNodes.pop();
+    }
+    Args.push_back(SI->getPointerOperand());
+    Args.push_back(VL);
+    Builder.CreateCall(VssegNFunc, Args);
+    return true;
   }
 
   Builder.CreateCall(VssegNFunc, {II->getOperand(0), II->getOperand(1),
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index b10da3d40befb..1f104cf3bc15d 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -855,10 +855,12 @@ class RISCVTargetLowering : public TargetLowering {
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
-  bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *II,
+  bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                        std::queue<std::pair<unsigned, Value *>> &LeafNodes,
                                         LoadInst *LI) const override;
 
   bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                       std::queue<Value *> &LeafNodes,
                                        StoreInst *SI) const override;
 
   bool supportKCFIBundles() const override { return true; }
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 2057cab46135f..41f8c5a72ce1e 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -154,6 +154,7 @@
 #include <string>
 #include <tuple>
 #include <utility>
+#include <queue>
 
 using namespace llvm;
 
@@ -459,10 +460,23 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
   // must use intrinsics to interleave.
   if (VecTy->isScalableTy()) {
-    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
-    return Builder.CreateIntrinsic(
-        WideVecTy, Intrinsic::experimental_vector_interleave2, Vals,
-        /*FMFSource=*/nullptr, Name);
+    SmallVector<Value *> Vecs(Vals);
+    // Interleave the vectors pairwise and append each intermediate result,
+    // so that intermediate results are interleaved again when more than two
+    // vectors are being interleaved. The resulting tree has
+    // 2 * Vals.size() - 1 nodes; the last one appended is the final
+    // interleaved value and needs no further processing.
+    unsigned NumNodesToInterleave = 2 * Vals.size() - 2;
+    for (unsigned I = 0; I < NumNodesToInterleave; I += 2) {
+      VectorType *InterleaveTy = cast<VectorType>(Vecs[I]->getType());
+      VectorType *WideVecTy =
+          VectorType::getDoubleElementsVectorType(InterleaveTy);
+      auto *InterleavedVec = Builder.CreateIntrinsic(
+          WideVecTy, Intrinsic::experimental_vector_interleave2,
+          {Vecs[I], Vecs[I + 1]}, /*FMFSource=*/nullptr, Name);
+      Vecs.push_back(InterleavedVec);
+    }
+    return Vecs.back();
   }
 
   // Fixed length. Start by concatenating all vectors into a wide vector.
@@ -2519,7 +2533,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
       unsigned Part, Value *MaskForGaps) -> Value * {
     if (VF.isScalable()) {
       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(InterleaveFactor == 2 &&
+      assert(isPowerOf2_32(InterleaveFactor) &&
              "Unsupported deinterleave factor for scalable vectors");
       auto *BlockInMaskPart = State.get(BlockInMask, Part);
       SmallVector<Value *, 2> Ops = {BlockInMaskPart, BlockInMaskPart};
@@ -2572,23 +2586,40 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(
     }
 
     if (VecTy->isScalableTy()) {
-      assert(InterleaveFactor == 2 &&
+      assert(isPowerOf2_32(InterleaveFactor) &&
              "Unsupported deinterleave factor for scalable vectors");
 
       for (unsigned Part = 0; Part < UF; ++Part) {
         // Scalable vectors cannot use arbitrary shufflevectors (only splats),
         // so must use intrinsics to deinterleave.
-        Value *DI = Builder.CreateIntrinsic(
-            Intrinsic::experimental_vector_deinterleave2, VecTy, NewLoads[Part],
-            /*FMFSource=*/nullptr, "strided.vec");
+        // Deinterleaving is done recursively: think of the wide load as the
+        // root of a tree in which every node is split by one deinterleave2
+        // call until the leaves, the final strided vectors, are reached. A
+        // tree with InterleaveFactor leaves has InterleaveFactor - 1
+        // interior nodes to deinterleave.
+        std::queue<Value *> Queue;
+        Queue.push(NewLoads[Part]);
+        unsigned NumInteriorNodes = InterleaveFactor - 1;
+        for (unsigned I = 0; I < NumInteriorNodes; ++I) {
+          auto *Node = Queue.front();
+          Queue.pop();
+          auto *DeinterleaveTy = Node->getType();
+          Value *DI = Builder.CreateIntrinsic(
+              Intrinsic::experimental_vector_deinterleave2, DeinterleaveTy,
+              Node, /*FMFSource=*/nullptr, "root.strided.vec");
+          Value *StridedVec1 = Builder.CreateExtractValue(DI, 0);
+          Value *StridedVec2 = Builder.CreateExtractValue(DI, 1);
+          Queue.push(StridedVec1);
+          Queue.push(StridedVec2);
+        }
 
         unsigned J = 0;
-        for (unsigned I = 0; I < InterleaveFactor; ++I) {
+        for (unsigned I = 0; I < InterleaveFactor && !Queue.empty(); ++I) {
           Instruction *Member = Group->getMember(I);
 
           if (!Member)
             continue;
 
-          Value *StridedVec = Builder.CreateExtractValue(DI, I);
+          auto *StridedVec = Queue.front();
+          Queue.pop();
+
           // If this member has different type, cast the result type.
           if (Member->getType() != ScalarTy) {
             VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
@@ -8659,9 +8691,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
               CM.getWideningDecision(IG->getInsertPos(), VF) ==
                   LoopVectorizationCostModel::CM_Interleave);
       // For scalable vectors, the only interleave factor currently supported
-      // is 2 since we require the (de)interleave2 intrinsics instead of
-      // shufflevectors.
-      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+      // is a power of 2, since we require the (de)interleave2 intrinsics
+      // (applied recursively) instead of shufflevectors.
+      assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
              "Unsupported interleave factor for scalable vectors");
       return Result;
     };
diff --git a/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
new file mode 100644
index 0000000000000..15e8b4fee002c
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-deinterleave-load.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64--linux-gnu -mattr=+sve < %s | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    cntw x10
+; CHECK-NEXT:    mov x8, xzr
+; CHECK-NEXT:    mov w9, #1024 // =0x400
+; CHECK-NEXT:    neg x10, x10
+; CHECK-NEXT:    rdvl x11, #4
+; CHECK-NEXT:  .LBB0_1: // %vector.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add x12, x1, x8
+; CHECK-NEXT:    adds x9, x9, x10
+; CHECK-NEXT:    ld4w { z0.s - z3.s }, p0/z, [x12]
+; CHECK-NEXT:    add x12, x2, x8
+; CHECK-NEXT:    ld4w { z4.s - z7.s }, p0/z, [x12]
+; CHECK-NEXT:    add x12, x0, x8
+; CHECK-NEXT:    add x8, x8, x11
+; CHECK-NEXT:    add z16.s, z4.s, z0.s
+; CHECK-NEXT:    sub z17.s, z1.s, z5.s
+; CHECK-NEXT:    movprfx z18, z2
+; CHECK-NEXT:    lsl z18.s, p0/m, z18.s, z6.s
+; CHECK-NEXT:    movprfx z19, z3
+; CHECK-NEXT:    asr z19.s, p0/m, z19.s, z7.s
+; CHECK-NEXT:    st4w { z16.s - z19.s }, p0, [x12]
+; CHECK-NEXT:    b.ne .LBB0_1
+; CHECK-NEXT:  // %bb.2: // %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+  %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+  %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+  %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+  %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+  %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+  %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+  %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+  %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+  %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+  %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+  %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+  %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+  %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+  %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+  %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+  %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+  %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+  %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+  %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+  %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+  %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+  %16 = add nsw <vscale x 4 x i32> %12, %5
+  %17 = sub nsw <vscale x 4 x i32> %6, %13
+  %18 = shl <vscale x 4 x i32> %7, %14
+  %19 = ashr <vscale x 4 x i32> %8, %15
+  %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+  %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %17)
+  %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %18, <vscale x 4 x i32> %19)
+  %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+  store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+  %index.next = add nuw i64 %index, %1
+  %21 = icmp eq i64 %index.next, 1024
+  br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @vector_deinterleave_load_nxv4i32_nxv8i32(ptr %p) {
+; CHECK-LABEL: vector_deinterleave_load_nxv4i32_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ld2w { z0.s, z1.s }, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %vec = load <vscale x 8 x i32>, ptr %p
+  %retval = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %vec)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %retval
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
new file mode 100644
index 0000000000000..aaad97a9014b6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/sve-deinterleave-load.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=riscv64 -mattr=+v,+zfh,+zvfh | FileCheck %s
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+
+define dso_local void @loop_xyzt(ptr noalias nocapture noundef writeonly %dst, ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: loop_xyzt:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    csrr a4, vlenb
+; CHECK-NEXT:    srli a3, a4, 1
+; CHECK-NEXT:    slli a4, a4, 3
+; CHECK-NEXT:    li a5, 1024
+; CHECK-NEXT:    vsetvli a6, zero, e32, m2, ta, ma
+; CHECK-NEXT:  .LBB0_1: # %vector.body
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vlseg4e32.v v8, (a1)
+; CHECK-NEXT:    vlseg4e32.v v16, (a2)
+; CHECK-NEXT:    vadd.vv v8, v16, v8
+; CHECK-NEXT:    vsub.vv v10, v10, v18
+; CHECK-NEXT:    vsll.vv v12, v12, v20
+; CHECK-NEXT:    vsra.vv v14, v14, v22
+; CHECK-NEXT:    vsseg4e32.v v8, (a0)
+; CHECK-NEXT:    sub a5, a5, a3
+; CHECK-NEXT:    add a0, a0, a4
+; CHECK-NEXT:    add a2, a2, a4
+; CHECK-NEXT:    add a1, a1, a4
+; CHECK-NEXT:    bnez a5, .LBB0_1
+; CHECK-NEXT:  # %bb.2: # %for.cond.cleanup
+; CHECK-NEXT:    ret
+entry:
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 2
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %2 = getelementptr inbounds %struct.xyzt, ptr %a, i64 %index
+  %wide.vec = load <vscale x 16 x i32>, ptr %2, align 4
+  %root.strided.vec = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
+  %3 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 0
+  %4 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec, 1
+  %root.strided.vec55 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %3)
+  %5 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 0
+  %6 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec55, 1
+  %root.strided.vec56 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %4)
+  %7 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 0
+  %8 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec56, 1
+  %9 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %index
+  %wide.vec57 = load <vscale x 16 x i32>, ptr %9, align 4
+  %root.strided.vec58 = tail call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec57)
+  %10 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 0
+  %11 = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root.strided.vec58, 1
+  %root.strided.vec59 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %10)
+  %12 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 0
+  %13 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec59, 1
+  %root.strided.vec60 = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %11)
+  %14 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 0
+  %15 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %root.strided.vec60, 1
+  %16 = add nsw <vscale x 4 x i32> %12, %5
+  %17 = sub nsw <vscale x 4 x i32> %6, %13
+  %18 = shl <vscale x 4 x i32> %7, %14
+  %19 = ashr <vscale x 4 x i32> %8, %15
+  %20 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %index
+  %interleaved.vec = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %16, <vscale x 4 x i32> %17)
+  %interleaved.vec61 = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %18, <vscale x 4 x i32> %19)
+  %interleaved.vec62 = tail call <vscale x 16 x i32> @llvm.experimental.vector.interleave2.nxv16i32(<vscale x 8 x i32> %interleaved.vec, <vscale x 8 x i32> %interleaved.vec61)
+  store <vscale x 16 x i32> %interleaved.vec62, ptr %20, align 4
+  %index.next = add nuw i64 %index, %1
+  %21 = icmp eq i64 %index.next, 1024
+  br i1 %21, label %for.cond.cleanup, label %vector.body
+
+for.cond.cleanup:                                 ; preds = %vector.body
+  ret void
+}
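
Editor's note (not part of the patch): the change above matches larger power-of-two interleave factors only when they are written as trees of the factor-2 intrinsics. The sketch below is illustrative and uses hypothetical value names; it shows the factor-4 load-side shape that lowerDeinterleaveIntrinsic walks. The root deinterleave2 separates even and odd lanes, each of its extractvalue results is deinterleaved once more, and the four leaf extractvalues are what get queued in LeafNodes and remapped by the target hook onto the results of a single structured load (for example llvm.aarch64.sve.ld4.sret on AArch64 or llvm.riscv.vlseg4 on RISC-V).

define void @illustrative_factor4_tree(ptr %p) {
  ; One wide load covering four interleaved i32 fields.
  %wide = load <vscale x 16 x i32>, ptr %p, align 4
  ; Root node: lanes 0,2,4,... go to result 0; lanes 1,3,5,... go to result 1.
  %root = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.experimental.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide)
  %even = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root, 0
  %odd = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %root, 1
  ; Second level: the four extractvalues below are the leaf nodes that the
  ; pass collects and rewrites to use a single ld4/vlseg4-style call.
  %d0 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %even)
  %field0 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 0 ; field 0
  %field2 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d0, 1 ; field 2
  %d1 = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %odd)
  %field1 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 0 ; field 1
  %field3 = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %d1, 1 ; field 3
  ret void
}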