diff --git a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h b/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h deleted file mode 100644 index 3178dc762a195..0000000000000 --- a/llvm/include/llvm/Transforms/Vectorize/EVLIndVarSimplify.h +++ /dev/null @@ -1,31 +0,0 @@ -//===------ EVLIndVarSimplify.h - Optimize vectorized loops w/ EVL IV------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass optimizes a vectorized loop with canonical IV to using EVL-based -// IV if it was tail-folded by predicated EVL. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H -#define LLVM_TRANSFORMS_VECTORIZE_EVLINDVARSIMPLIFY_H - -#include "llvm/Analysis/LoopAnalysisManager.h" -#include "llvm/IR/PassManager.h" - -namespace llvm { -class Loop; -class LPMUpdater; - -/// Turn vectorized loops with canonical induction variables into loops that -/// only use a single EVL-based induction variable. -struct EVLIndVarSimplifyPass : public PassInfoMixin { - PreservedAnalyses run(Loop &L, LoopAnalysisManager &LAM, - LoopStandardAnalysisResults &AR, LPMUpdater &U); -}; -} // namespace llvm -#endif diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index e4dab4acc0b4a..f84a16bd97224 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -375,7 +375,6 @@ #include "llvm/Transforms/Utils/SymbolRewriter.h" #include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h" #include "llvm/Transforms/Utils/UnifyLoopExits.h" -#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" #include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h" #include "llvm/Transforms/Vectorize/LoopIdiomVectorize.h" #include "llvm/Transforms/Vectorize/LoopVectorize.h" diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 49d5d08474f0f..f0e7d36f78aab 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -755,7 +755,6 @@ LOOP_ANALYSIS("should-run-extra-simple-loop-unswitch", #endif LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass()) LOOP_PASS("dot-ddg", DDGDotPrinterPass()) -LOOP_PASS("evl-iv-simplify", EVLIndVarSimplifyPass()) LOOP_PASS("guard-widening", GuardWideningPass()) LOOP_PASS("extra-simple-loop-unswitch-passes", ExtraLoopPassManager()) diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 96670fe3ea195..9f4a242214471 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -1,5 +1,4 @@ add_llvm_component_library(LLVMVectorize - EVLIndVarSimplify.cpp LoadStoreVectorizer.cpp LoopIdiomVectorize.cpp LoopVectorizationLegality.cpp diff --git a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp b/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp deleted file mode 100644 index 5dd689799b828..0000000000000 --- a/llvm/lib/Transforms/Vectorize/EVLIndVarSimplify.cpp +++ /dev/null @@ -1,300 +0,0 @@ -//===---- EVLIndVarSimplify.cpp - Optimize vectorized loops w/ EVL IV------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This pass optimizes a vectorized loop with canonical IV to using EVL-based -// IV if it was tail-folded by predicated EVL. -// -//===----------------------------------------------------------------------===// - -#include "llvm/Transforms/Vectorize/EVLIndVarSimplify.h" -#include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/IVDescriptors.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/OptimizationRemarkEmitter.h" -#include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" -#include "llvm/Analysis/ValueTracking.h" -#include "llvm/IR/IRBuilder.h" -#include "llvm/IR/PatternMatch.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/MathExtras.h" -#include "llvm/Support/raw_ostream.h" -#include "llvm/Transforms/Scalar/LoopPassManager.h" -#include "llvm/Transforms/Utils/Local.h" - -#define DEBUG_TYPE "evl-iv-simplify" - -using namespace llvm; - -STATISTIC(NumEliminatedCanonicalIV, "Number of canonical IVs we eliminated"); - -static cl::opt EnableEVLIndVarSimplify( - "enable-evl-indvar-simplify", - cl::desc("Enable EVL-based induction variable simplify Pass"), cl::Hidden, - cl::init(true)); - -namespace { -struct EVLIndVarSimplifyImpl { - ScalarEvolution &SE; - OptimizationRemarkEmitter *ORE = nullptr; - - EVLIndVarSimplifyImpl(LoopStandardAnalysisResults &LAR, - OptimizationRemarkEmitter *ORE) - : SE(LAR.SE), ORE(ORE) {} - - /// Returns true if modify the loop. - bool run(Loop &L); -}; -} // anonymous namespace - -/// Returns the constant part of vectorization factor from the induction -/// variable's step value SCEV expression. -static uint32_t getVFFromIndVar(const SCEV *Step, const Function &F) { - if (!Step) - return 0U; - - // Looking for loops with IV step value in the form of `( x - // vscale)`. - if (const auto *Mul = dyn_cast(Step)) { - if (Mul->getNumOperands() == 2) { - const SCEV *LHS = Mul->getOperand(0); - const SCEV *RHS = Mul->getOperand(1); - if (const auto *Const = dyn_cast(LHS); - Const && isa(RHS)) { - uint64_t V = Const->getAPInt().getLimitedValue(); - if (llvm::isUInt<32>(V)) - return V; - } - } - } - - // If not, see if the vscale_range of the parent function is a fixed value, - // which makes the step value to be replaced by a constant. - if (F.hasFnAttribute(Attribute::VScaleRange)) - if (const auto *ConstStep = dyn_cast(Step)) { - APInt V = ConstStep->getAPInt().abs(); - ConstantRange CR = llvm::getVScaleRange(&F, 64); - if (const APInt *Fixed = CR.getSingleElement()) { - V = V.zextOrTrunc(Fixed->getBitWidth()); - uint64_t VF = V.udiv(*Fixed).getLimitedValue(); - if (VF && llvm::isUInt<32>(VF) && - // Make sure step is divisible by vscale. - V.urem(*Fixed).isZero()) - return VF; - } - } - - return 0U; -} - -bool EVLIndVarSimplifyImpl::run(Loop &L) { - if (!EnableEVLIndVarSimplify) - return false; - - if (!getBooleanLoopAttribute(&L, "llvm.loop.isvectorized")) - return false; - const MDOperand *EVLMD = - findStringMetadataForLoop(&L, "llvm.loop.isvectorized.tailfoldingstyle") - .value_or(nullptr); - if (!EVLMD || !EVLMD->equalsStr("evl")) - return false; - - BasicBlock *LatchBlock = L.getLoopLatch(); - ICmpInst *OrigLatchCmp = L.getLatchCmpInst(); - if (!LatchBlock || !OrigLatchCmp) - return false; - - InductionDescriptor IVD; - PHINode *IndVar = L.getInductionVariable(SE); - if (!IndVar || !L.getInductionDescriptor(SE, IVD)) { - const char *Reason = (IndVar ? "induction descriptor is not available" - : "cannot recognize induction variable"); - LLVM_DEBUG(dbgs() << "Cannot retrieve IV from loop " << L.getName() - << " because" << Reason << "\n"); - if (ORE) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar", - L.getStartLoc(), L.getHeader()) - << "Cannot retrieve IV because " << ore::NV("Reason", Reason); - }); - } - return false; - } - - BasicBlock *InitBlock, *BackEdgeBlock; - if (!L.getIncomingAndBackEdge(InitBlock, BackEdgeBlock)) { - LLVM_DEBUG(dbgs() << "Expect unique incoming and backedge in " - << L.getName() << "\n"); - if (ORE) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure", - L.getStartLoc(), L.getHeader()) - << "Does not have a unique incoming and backedge"; - }); - } - return false; - } - - // Retrieve the loop bounds. - std::optional Bounds = L.getBounds(SE); - if (!Bounds) { - LLVM_DEBUG(dbgs() << "Could not obtain the bounds for loop " << L.getName() - << "\n"); - if (ORE) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedLoopStructure", - L.getStartLoc(), L.getHeader()) - << "Could not obtain the loop bounds"; - }); - } - return false; - } - Value *CanonicalIVInit = &Bounds->getInitialIVValue(); - Value *CanonicalIVFinal = &Bounds->getFinalIVValue(); - - const SCEV *StepV = IVD.getStep(); - uint32_t VF = getVFFromIndVar(StepV, *L.getHeader()->getParent()); - if (!VF) { - LLVM_DEBUG(dbgs() << "Could not infer VF from IndVar step '" << *StepV - << "'\n"); - if (ORE) { - ORE->emit([&]() { - return OptimizationRemarkMissed(DEBUG_TYPE, "UnrecognizedIndVar", - L.getStartLoc(), L.getHeader()) - << "Could not infer VF from IndVar step " - << ore::NV("Step", StepV); - }); - } - return false; - } - LLVM_DEBUG(dbgs() << "Using VF=" << VF << " for loop " << L.getName() - << "\n"); - - // Try to find the EVL-based induction variable. - using namespace PatternMatch; - BasicBlock *BB = IndVar->getParent(); - - Value *EVLIndVar = nullptr; - Value *RemTC = nullptr; - Value *TC = nullptr; - auto IntrinsicMatch = m_Intrinsic( - m_Value(RemTC), m_SpecificInt(VF), - /*Scalable=*/m_SpecificInt(1)); - for (PHINode &PN : BB->phis()) { - if (&PN == IndVar) - continue; - - // Check 1: it has to contain both incoming (init) & backedge blocks - // from IndVar. - if (PN.getBasicBlockIndex(InitBlock) < 0 || - PN.getBasicBlockIndex(BackEdgeBlock) < 0) - continue; - // Check 2: EVL index is always increasing, thus its inital value has to be - // equal to either the initial IV value (when the canonical IV is also - // increasing) or the last IV value (when canonical IV is decreasing). - Value *Init = PN.getIncomingValueForBlock(InitBlock); - using Direction = Loop::LoopBounds::Direction; - switch (Bounds->getDirection()) { - case Direction::Increasing: - if (Init != CanonicalIVInit) - continue; - break; - case Direction::Decreasing: - if (Init != CanonicalIVFinal) - continue; - break; - case Direction::Unknown: - // To be more permissive and see if either the initial or final IV value - // matches PN's init value. - if (Init != CanonicalIVInit && Init != CanonicalIVFinal) - continue; - break; - } - Value *RecValue = PN.getIncomingValueForBlock(BackEdgeBlock); - assert(RecValue && "expect recurrent IndVar value"); - - LLVM_DEBUG(dbgs() << "Found candidate PN of EVL-based IndVar: " << PN - << "\n"); - - // Check 3: Pattern match to find the EVL-based index and total trip count - // (TC). - if (match(RecValue, - m_c_Add(m_ZExtOrSelf(IntrinsicMatch), m_Specific(&PN))) && - match(RemTC, m_Sub(m_Value(TC), m_Specific(&PN)))) { - EVLIndVar = RecValue; - break; - } - } - - if (!EVLIndVar || !TC) - return false; - - LLVM_DEBUG(dbgs() << "Using " << *EVLIndVar << " for EVL-based IndVar\n"); - if (ORE) { - ORE->emit([&]() { - DebugLoc DL; - BasicBlock *Region = nullptr; - if (auto *I = dyn_cast(EVLIndVar)) { - DL = I->getDebugLoc(); - Region = I->getParent(); - } else { - DL = L.getStartLoc(); - Region = L.getHeader(); - } - return OptimizationRemark(DEBUG_TYPE, "UseEVLIndVar", DL, Region) - << "Using " << ore::NV("EVLIndVar", EVLIndVar) - << " for EVL-based IndVar"; - }); - } - - // Create an EVL-based comparison and replace the branch to use it as - // predicate. - - // Loop::getLatchCmpInst check at the beginning of this function has ensured - // that latch block ends in a conditional branch. - auto *LatchBranch = cast(LatchBlock->getTerminator()); - assert(LatchBranch->isConditional() && - "expect the loop latch to be ended with a conditional branch"); - ICmpInst::Predicate Pred; - if (LatchBranch->getSuccessor(0) == L.getHeader()) - Pred = ICmpInst::ICMP_NE; - else - Pred = ICmpInst::ICMP_EQ; - - IRBuilder<> Builder(OrigLatchCmp); - auto *NewLatchCmp = Builder.CreateICmp(Pred, EVLIndVar, TC); - OrigLatchCmp->replaceAllUsesWith(NewLatchCmp); - - // llvm::RecursivelyDeleteDeadPHINode only deletes cycles whose values are - // not used outside the cycles. However, in this case the now-RAUW-ed - // OrigLatchCmp will be considered a use outside the cycle while in reality - // it's practically dead. Thus we need to remove it before calling - // RecursivelyDeleteDeadPHINode. - (void)RecursivelyDeleteTriviallyDeadInstructions(OrigLatchCmp); - if (llvm::RecursivelyDeleteDeadPHINode(IndVar)) - LLVM_DEBUG(dbgs() << "Removed original IndVar\n"); - - ++NumEliminatedCanonicalIV; - - return true; -} - -PreservedAnalyses EVLIndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &LAM, - LoopStandardAnalysisResults &AR, - LPMUpdater &U) { - Function &F = *L.getHeader()->getParent(); - auto &FAMProxy = LAM.getResult(L, AR); - OptimizationRemarkEmitter *ORE = - FAMProxy.getCachedResult(F); - - if (EVLIndVarSimplifyImpl(AR, ORE).run(L)) - return PreservedAnalyses::allInSet(); - return PreservedAnalyses::all(); -} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll b/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll deleted file mode 100644 index 4de0e666149f3..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/RISCV/evl-iv-simplify.ll +++ /dev/null @@ -1,333 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify)' < %s | FileCheck %s -; RUN: opt -S -mtriple=riscv64 -mattr='+v' --passes='loop(evl-iv-simplify),function(simplifycfg,dce)' < %s | FileCheck %s --check-prefix=LOOP-DEL - -define void @simple(ptr noalias %a, ptr noalias %b, %c, i64 %N) vscale_range(2, 1024) { -; CHECK-LABEL: define void @simple( -; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; CHECK-NEXT: br i1 [[TMP3]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 -; CHECK-NEXT: [[TMP8:%.*]] = sub i64 [[TMP7]], 1 -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[N]], [[TMP8]] -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP5]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP11:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; CHECK-NEXT: [[TMP12:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP11]], i32 4, i1 true) -; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 -; CHECK-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP17]], splat (i1 true), i32 [[TMP12]]) -; CHECK-NEXT: [[TMP18:%.*]] = add nsw [[C]], [[VP_OP_LOAD1]] -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP13]] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i32 0 -; CHECK-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP18]], ptr align 4 [[TMP20]], splat (i1 true), i32 [[TMP12]]) -; CHECK-NEXT: [[TMP21:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP21]], [[EVL_BASED_IV]] -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; -; LOOP-DEL-LABEL: define void @simple( -; LOOP-DEL-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; LOOP-DEL-NEXT: entry: -; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 -1, [[N]] -; LOOP-DEL-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; LOOP-DEL-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 4 -; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]] -; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_BODY:%.*]], label [[VECTOR_PH:%.*]] -; LOOP-DEL: vector.ph: -; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]] -; LOOP-DEL: vector.body: -; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; LOOP-DEL-NEXT: [[TMP4:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: [[TMP5:%.*]] = call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP4]], i32 4, i1 true) -; LOOP-DEL-NEXT: [[TMP6:%.*]] = add i64 [[EVL_BASED_IV]], 0 -; LOOP-DEL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP6]] -; LOOP-DEL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 -; LOOP-DEL-NEXT: [[VP_OP_LOAD1:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP10]], splat (i1 true), i32 [[TMP5]]) -; LOOP-DEL-NEXT: [[TMP11:%.*]] = add nsw [[C]], [[VP_OP_LOAD1]] -; LOOP-DEL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] -; LOOP-DEL-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 0 -; LOOP-DEL-NEXT: call void @llvm.vp.store.nxv4i32.p0( [[TMP11]], ptr align 4 [[TMP13]], splat (i1 true), i32 [[TMP5]]) -; LOOP-DEL-NEXT: [[TMP14:%.*]] = zext i32 [[TMP5]] to i64 -; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[TMP14]], [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; LOOP-DEL-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; LOOP-DEL: for.body: -; LOOP-DEL-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] -; LOOP-DEL-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] -; LOOP-DEL-NEXT: [[ADD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; LOOP-DEL-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IV]] -; LOOP-DEL-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX4]], align 4 -; LOOP-DEL-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 -; LOOP-DEL-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; LOOP-DEL-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; LOOP-DEL: for.cond.cleanup: -; LOOP-DEL-NEXT: ret void -; -entry: - %0 = sub i64 -1, %N - %1 = call i64 @llvm.vscale.i64() - %2 = mul i64 %1, 4 - %3 = icmp ult i64 %0, %2 - br i1 %3, label %scalar.ph, label %vector.ph - -vector.ph: ; preds = %entry - %4 = call i64 @llvm.vscale.i64() - %5 = mul i64 %4, 4 - %6 = call i64 @llvm.vscale.i64() - %7 = mul i64 %6, 4 - %8 = sub i64 %7, 1 - %n.rnd.up = add i64 %N, %8 - %n.mod.vf = urem i64 %n.rnd.up, %5 - %n.vec = sub i64 %n.rnd.up, %n.mod.vf - %9 = call i64 @llvm.vscale.i64() - %10 = mul i64 %9, 4 - br label %vector.body - -vector.body: ; preds = %vector.body, %vector.ph - %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ] - %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ] - %11 = sub i64 %N, %evl.based.iv - %12 = call i32 @llvm.experimental.get.vector.length.i64(i64 %11, i32 4, i1 true) - %13 = add i64 %evl.based.iv, 0 - %14 = getelementptr inbounds i32, ptr %b, i64 %13 - %15 = getelementptr inbounds i32, ptr %14, i32 0 - %vp.op.load = call @llvm.vp.load.nxv4i32.p0(ptr align 4 %15, splat (i1 true), i32 %12) - %18 = add nsw %c, %vp.op.load - %19 = getelementptr inbounds i32, ptr %a, i64 %13 - %20 = getelementptr inbounds i32, ptr %19, i32 0 - call void @llvm.vp.store.nxv4i32.p0( %18, ptr align 4 %20, splat (i1 true), i32 %12) - %21 = zext i32 %12 to i64 - %index.evl.next = add i64 %21, %evl.based.iv - %index.next = add nuw i64 %index, %10 - %22 = icmp eq i64 %index.next, %n.vec - br i1 %22, label %middle.block, label %vector.body, !llvm.loop !0 - -middle.block: ; preds = %vector.body - br i1 true, label %for.cond.cleanup, label %scalar.ph - -scalar.ph: ; preds = %entry, %middle.block - %bc.resume.val = phi i64 [ %n.vec, %middle.block ], [ 0, %entry ] - br label %for.body - -for.body: ; preds = %for.body, %scalar.ph - %iv = phi i64 [ %bc.resume.val, %scalar.ph ], [ %iv.next, %for.body ] - %arrayidx = getelementptr inbounds i32, ptr %b, i64 %iv - %23 = load i32, ptr %arrayidx, align 4 - %arrayidx4 = getelementptr inbounds i32, ptr %a, i64 %iv - store i32 %23, ptr %arrayidx4, align 4 - %iv.next = add nuw nsw i64 %iv, 1 - %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !llvm.loop !3 - -for.cond.cleanup: ; preds = %middle.block, %for.body - ret void -} - -; Fixed IV steps resulting from vscale_range with a single element - -define void @fixed_iv_step(ptr %arg0, ptr %arg1, i64 %N) #0 { -; CHECK-LABEL: define void @fixed_iv_step( -; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] -; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] -; CHECK: for.end.loopexit5: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -; LOOP-DEL-LABEL: define void @fixed_iv_step( -; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]], i64 [[N:%.*]]) #[[ATTR1:[0-9]+]] { -; LOOP-DEL-NEXT: entry: -; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 -; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]] -; LOOP-DEL: vector.body: -; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 [[N]], [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) -; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) -; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] -; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], [[N]] -; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] -; LOOP-DEL: for.end: -; LOOP-DEL-NEXT: ret void -; -entry: - br label %vector.ph - -vector.ph: - %n.rnd.up = add nsw i64 %N, 15 - %n.vec = and i64 %n.rnd.up, -16 - %broadcast.splatinsert = insertelement poison, ptr %arg0, i64 0 - %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer - br label %vector.body - -vector.body: - %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ] - %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ] - %41 = sub i64 %N, %evl.based.iv - %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true) - %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv - tail call void @llvm.vp.store.nxv2p0.p0( %broadcast.splat, ptr align 8 %gep, splat (i1 true), i32 %42) - %43 = zext i32 %42 to i64 - %index.evl.next = add i64 %evl.based.iv, %43 - %lsr.iv.next33 = add i64 %lsr.iv32, -16 - %44 = icmp eq i64 %lsr.iv.next33, 0 - br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3 - -for.end.loopexit5: - br label %for.end - -for.end: - ret void -} - -; Fixed IV step and trip count -define void @fixed_iv_step_tc(ptr %arg0, ptr %arg1) #0 { -; CHECK-LABEL: define void @fixed_iv_step_tc( -; CHECK-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]]) #[[ATTR1]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]] -; CHECK-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) -; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] -; CHECK-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) -; CHECK-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; CHECK-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87 -; CHECK-NEXT: br i1 [[TMP3]], label [[FOR_END_LOOPEXIT5:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] -; CHECK: for.end.loopexit5: -; CHECK-NEXT: br label [[FOR_END:%.*]] -; CHECK: for.end: -; CHECK-NEXT: ret void -; -; LOOP-DEL-LABEL: define void @fixed_iv_step_tc( -; LOOP-DEL-SAME: ptr [[ARG0:%.*]], ptr [[ARG1:%.*]]) #[[ATTR1]] { -; LOOP-DEL-NEXT: entry: -; LOOP-DEL-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, ptr [[ARG0]], i64 0 -; LOOP-DEL-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; LOOP-DEL-NEXT: br label [[VECTOR_BODY:%.*]] -; LOOP-DEL: vector.body: -; LOOP-DEL-NEXT: [[EVL_BASED_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_EVL_NEXT:%.*]], [[VECTOR_BODY]] ] -; LOOP-DEL-NEXT: [[TMP0:%.*]] = sub i64 87, [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: [[TMP1:%.*]] = tail call i32 @llvm.experimental.get.vector.length.i64(i64 [[TMP0]], i32 2, i1 true) -; LOOP-DEL-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[ARG1]], i64 [[EVL_BASED_IV]] -; LOOP-DEL-NEXT: tail call void @llvm.vp.store.nxv2p0.p0( [[BROADCAST_SPLAT]], ptr align 8 [[GEP]], splat (i1 true), i32 [[TMP1]]) -; LOOP-DEL-NEXT: [[TMP2:%.*]] = zext i32 [[TMP1]] to i64 -; LOOP-DEL-NEXT: [[INDEX_EVL_NEXT]] = add i64 [[EVL_BASED_IV]], [[TMP2]] -; LOOP-DEL-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_EVL_NEXT]], 87 -; LOOP-DEL-NEXT: br i1 [[TMP3]], label [[FOR_END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4]] -; LOOP-DEL: for.end: -; LOOP-DEL-NEXT: ret void -; -entry: - br label %vector.ph - -vector.ph: - %n.rnd.up = add nsw i64 87, 15 - %n.vec = and i64 %n.rnd.up, -16 - %broadcast.splatinsert = insertelement poison, ptr %arg0, i64 0 - %broadcast.splat = shufflevector %broadcast.splatinsert, poison, zeroinitializer - br label %vector.body - -vector.body: - %lsr.iv32 = phi i64 [ %lsr.iv.next33, %vector.body ], [ %n.vec, %vector.ph ] - %evl.based.iv = phi i64 [ 0, %vector.ph ], [ %index.evl.next, %vector.body ] - %41 = sub i64 87, %evl.based.iv - %42 = tail call i32 @llvm.experimental.get.vector.length.i64(i64 %41, i32 2, i1 true) - %gep = getelementptr ptr, ptr %arg1, i64 %evl.based.iv - tail call void @llvm.vp.store.nxv2p0.p0( %broadcast.splat, ptr align 8 %gep, splat (i1 true), i32 %42) - %43 = zext i32 %42 to i64 - %index.evl.next = add i64 %evl.based.iv, %43 - %lsr.iv.next33 = add i64 %lsr.iv32, -16 - %44 = icmp eq i64 %lsr.iv.next33, 0 - br i1 %44, label %for.end.loopexit5, label %vector.body, !llvm.loop !3 - -for.end.loopexit5: - br label %for.end - -for.end: - ret void -} - -declare i64 @llvm.vscale.i64() - -declare i32 @llvm.experimental.get.vector.length.i64(i64, i32 immarg, i1 immarg) - -declare @llvm.vp.load.nxv4i32.p0(ptr nocapture, , i32) - -declare void @llvm.vp.store.nxv4i32.p0(, ptr nocapture, , i32) - -attributes #0 = { vscale_range(8,8) } - -!0 = distinct !{!0, !1, !2, !4} -!1 = !{!"llvm.loop.isvectorized", i32 1} -!2 = !{!"llvm.loop.unroll.runtime.disable"} -!3 = distinct !{!3, !2, !1, !4} -!4 = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -;. -; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} -; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[META3]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]], [[META3]]} -;. -; LOOP-DEL: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]], [[META3:![0-9]+]]} -; LOOP-DEL: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} -; LOOP-DEL: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; LOOP-DEL: [[META3]] = !{!"llvm.loop.isvectorized.tailfoldingstyle", !"evl"} -; LOOP-DEL: [[LOOP4]] = distinct !{[[LOOP4]], [[META2]], [[META1]], [[META3]]} -;.