diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 5234ef8788d9e..2acd0117decfb 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1672,6 +1672,9 @@ class TargetTransformInfo { /// \return The maximum number of function arguments the target supports. unsigned getMaxNumArgs() const; + /// \returns Whether vector operations are a good candidate for vector widen. + bool considerToWiden(LLVMContext &Context, ArrayRef IL) const; + /// @} private: @@ -2041,6 +2044,8 @@ class TargetTransformInfo::Concept { getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; + virtual bool considerToWiden(LLVMContext &Context, + ArrayRef IL) const = 0; }; template @@ -2757,6 +2762,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } + + bool considerToWiden(LLVMContext &Context, + ArrayRef IL) const override { + return Impl.considerToWiden(Context, IL); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index c1ff314ae51c9..5278c9d4dc8c3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -895,6 +895,10 @@ class TargetTransformInfoImplBase { unsigned getMaxNumArgs() const { return UINT_MAX; } + bool considerToWiden(LLVMContext &Context, ArrayRef IL) const { + return false; + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/include/llvm/Transforms/Vectorize/VectorWiden.h b/llvm/include/llvm/Transforms/Vectorize/VectorWiden.h new file mode 100644 index 0000000000000..6988785a92ce0 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/VectorWiden.h @@ -0,0 +1,25 @@ +//===--- VectorWiden.h - Combining Vector Operations to wider types ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===---------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VECTORWIDENING_H +#define LLVM_TRANSFORMS_VECTORIZE_VECTORWIDENING_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class VectorWidenPass : public PassInfoMixin { +public: + VectorWidenPass() {} + + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; + +} // namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VECTORWIDENING_H diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index aad14f21d1146..d01ed739a5a71 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1248,6 +1248,11 @@ bool TargetTransformInfo::hasActiveVectorLength(unsigned Opcode, Type *DataType, return TTIImpl->hasActiveVectorLength(Opcode, DataType, Alignment); } +bool TargetTransformInfo::considerToWiden(LLVMContext &Context, + ArrayRef IL) const { + return TTIImpl->considerToWiden(Context, IL); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 985ff88139323..8ef983e5d8651 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -233,8 +233,8 @@ #include "llvm/Transforms/Utils/CanonicalizeAliases.h" #include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h" #include "llvm/Transforms/Utils/CountVisits.h" -#include "llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/DXILUpgrade.h" +#include "llvm/Transforms/Utils/Debugify.h" #include "llvm/Transforms/Utils/EntryExitInstrumenter.h" #include "llvm/Transforms/Utils/FixIrreducible.h" #include "llvm/Transforms/Utils/HelloWorld.h" @@ -263,6 +263,7 @@ #include "llvm/Transforms/Vectorize/LoopVectorize.h" #include "llvm/Transforms/Vectorize/SLPVectorizer.h" #include "llvm/Transforms/Vectorize/VectorCombine.h" +#include "llvm/Transforms/Vectorize/VectorWiden.h" #include using namespace llvm; diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index df9f14920f291..2eef2f0a22d95 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -428,6 +428,7 @@ FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("typepromotion", TypePromotionPass(TM)) FUNCTION_PASS("unify-loop-exits", UnifyLoopExitsPass()) FUNCTION_PASS("vector-combine", VectorCombinePass()) +FUNCTION_PASS("vector-widen", VectorWidenPass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index cded28054f592..cecf02ee250b0 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2426,6 +2426,14 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, CostKind, I)); } + static const TypeConversionCostTblEntry SME2Tbl[] = { + {ISD::FP_ROUND, MVT::nxv8f16, MVT::nxv8f32, 1}}; + + if (ST->hasSME2()) + if (const auto *Entry = ConvertCostTableLookup( + SME2Tbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT())) + return AdjustCost(Entry->Cost); + if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD, DstTy.getSimpleVT(), 
SrcTy.getSimpleVT())) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index a6baade412c77..9afba1ec17ab7 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -412,6 +412,26 @@ class AArch64TTIImpl : public BasicTTIImplBase { return BaseT::getStoreMinimumVF(VF, ScalarMemTy, ScalarValTy); } + + bool considerToWiden(LLVMContext &Context, ArrayRef IL) const { + unsigned Opcode = IL[0]->getOpcode(); + Type *Ty = IL[0]->getType(); + if (!ST->hasSME2()) + return false; + if (llvm::any_of(IL, [Opcode, Ty](Instruction *I) { + return (Opcode != I->getOpcode() || Ty != I->getType()); + })) + return false; + if (Opcode == Instruction::FPTrunc && + Ty == ScalableVectorType::get(Type::getHalfTy(Context), 4)) + return true; + if (Opcode == Instruction::Add && + Ty == ScalableVectorType::get(Type::getInt32Ty(Context), 4) && + (IL[0]->getOperand(1) == IL[1]->getOperand(1) || + IL[0]->getOperand(0) == IL[1]->getOperand(0))) + return true; + return false; + } }; } // end namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt index 998dfd956575d..a1537bb1ffa63 100644 --- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt +++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_component_library(LLVMVectorize SLPVectorizer.cpp Vectorize.cpp VectorCombine.cpp + VectorWiden.cpp VPlan.cpp VPlanHCFGBuilder.cpp VPlanRecipes.cpp diff --git a/llvm/lib/Transforms/Vectorize/VectorWiden.cpp b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp new file mode 100644 index 0000000000000..2b7e7eaa77840 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VectorWiden.cpp @@ -0,0 +1,429 @@ +//===--- VectorWiden.cpp - Combining Vector Operations to wider types ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This pass tries to widen vector operations to a wider type. It finds +// operations of a given vector type that are independent of each other, in the +// same bottom-up fashion as SLP does with scalars. It detects consecutive +// stores that can be combined into a single wider vector store, and then +// attempts to construct a vectorizable tree using the use-def chains.
+// +//==------------------------------------------------------------------------==// + +#include "llvm/Transforms/Vectorize/VectorWiden.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" +#include "llvm/IR/Type.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/Local.h" +#include "llvm/Transforms/Utils/LoopUtils.h" + +using namespace llvm; + +#define DEBUG_TYPE "vector-widen" + +// We consider independent operations for widening, with the possibility of +// merging those operations into one, and also widen stores if we find store +// instructions later. We have to bound the distance between those +// independent operations, or we might introduce excessive register pressure, etc. + +static cl::opt + MaxInstDistance("vw-max-instr-distance", cl::init(30), cl::Hidden, + cl::desc("Maximum distance between instructions to " "consider to widen")); + +static cl::opt OverrideTargetConsiderToWiden( + "vw-override-target-consider-to-widen", cl::init(false), cl::Hidden, + cl::desc("Ignore any target information while considering whether to widen")); + +namespace { +class VectorWiden { +public: + using InstrList = SmallVector; + using ValueList = SmallVector; + VectorWiden(Function &F, const TargetTransformInfo &TTI) + : F(F), Builder(F.getContext()), TTI(TTI) {} + + bool run(); + +private: + Function &F; + IRBuilder<> Builder; + const TargetTransformInfo &TTI; + TargetLibraryInfo *TLI; + + DenseSet DeletedInstructions; + + /// Checks if the instruction is marked for deletion. + bool isDeleted(Instruction *I) const { return DeletedInstructions.count(I); } + + /// Removes an instruction from its block and eventually deletes it. + void eraseInstruction(Instruction *I) { DeletedInstructions.insert(I); } + + bool processBB(BasicBlock &BB, LLVMContext &Context); + + bool canWidenNode(ArrayRef IL, LLVMContext &Context); + + bool widenNode(ArrayRef IL, LLVMContext &Context); + + void widenCastInst(ArrayRef IL); + + void widenBinaryOperator(ArrayRef IL); + + InstructionCost getOpCost(unsigned Opcode, Type *To, Type *From, + Instruction *I); +}; +} // namespace + +void VectorWiden::widenCastInst(ArrayRef IL) { + Instruction *I = IL[0]; + Instruction *I1 = IL[1]; + auto *RetOrigType = cast(I->getType()); + auto *OrigType = cast(I->getOperand(0)->getType()); + auto *RetType = VectorType::getDoubleElementsVectorType(RetOrigType); + auto *OpType = VectorType::getDoubleElementsVectorType(OrigType); + + bool isBitCast = I->getOpcode() == Instruction::BitCast; + unsigned Offset = + dyn_cast(OrigType) + ? (cast(OrigType))->getMinNumElements() + : (cast(OrigType))->getNumElements(); + unsigned BitCastOffsetExtract = + (dyn_cast(RetType) + ?
(cast(RetType))->getMinNumElements() + : (cast(RetType))->getNumElements()) / + 2; + Value *WideVec = UndefValue::get(OpType); + Builder.SetInsertPoint(I); + Function *InsertIntr = llvm::Intrinsic::getDeclaration( + F.getParent(), Intrinsic::vector_insert, {OpType, OrigType}); + Value *Insert1 = Builder.CreateCall( + InsertIntr, {WideVec, I->getOperand(0), Builder.getInt64(0)}); + Value *Insert2 = Builder.CreateCall( + InsertIntr, {Insert1, I1->getOperand(0), Builder.getInt64(Offset)}); + Value *ResCast = Builder.CreateCast(Instruction::CastOps(I->getOpcode()), + Insert2, RetType); + + Function *ExtractIntr = llvm::Intrinsic::getDeclaration( + F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType}); + if (!I->users().empty()) { + Value *Res = + Builder.CreateCall(ExtractIntr, {ResCast, Builder.getInt64(0)}); + I->replaceAllUsesWith(Res); + } + if (!I1->users().empty()) { + Value *Res = Builder.CreateCall( + ExtractIntr, + {ResCast, Builder.getInt64(isBitCast ? BitCastOffsetExtract : Offset)}); + I1->replaceAllUsesWith(Res); + } +} + +void VectorWiden::widenBinaryOperator(ArrayRef IL) { + Instruction *I = IL[0]; + Instruction *I1 = IL[1]; + + Value *XHi = I->getOperand(0); + Value *XLo = I1->getOperand(0); + Value *YHi = I->getOperand(1); + Value *YLo = I1->getOperand(1); + + auto *RetOrigType = cast(I->getType()); + auto *OrigType = cast(I->getOperand(0)->getType()); + auto *RetType = VectorType::getDoubleElementsVectorType(RetOrigType); + auto *OpType = VectorType::getDoubleElementsVectorType(OrigType); + unsigned Offset = + dyn_cast(OrigType) + ? (cast(OrigType))->getMinNumElements() + : (cast(OrigType))->getNumElements(); + Value *WideVec = UndefValue::get(OpType); + Builder.SetInsertPoint(I); + Function *InsertIntr = llvm::Intrinsic::getDeclaration( + F.getParent(), Intrinsic::vector_insert, {OpType, OrigType}); + Value *X1 = + Builder.CreateCall(InsertIntr, {WideVec, XLo, Builder.getInt64(0)}); + Value *X2 = + Builder.CreateCall(InsertIntr, {X1, XHi, Builder.getInt64(Offset)}); + Value *Y1 = + Builder.CreateCall(InsertIntr, {WideVec, YLo, Builder.getInt64(0)}); + Value *Y2 = + Builder.CreateCall(InsertIntr, {Y1, YHi, Builder.getInt64(Offset)}); + Value *ResBinOp = + Builder.CreateBinOp((Instruction::BinaryOps)I->getOpcode(), X2, Y2); + ValueList VL; + for (Instruction *I : IL) + VL.push_back(I); + + propagateIRFlags(ResBinOp, VL); + + Function *ExtractIntr = llvm::Intrinsic::getDeclaration( + F.getParent(), Intrinsic::vector_extract, {RetOrigType, RetType}); + if (!I->users().empty()) { + Value *Res = + Builder.CreateCall(ExtractIntr, {ResBinOp, Builder.getInt64(Offset)}); + I->replaceAllUsesWith(Res); + } + if (!I1->users().empty()) { + Value *Res = + Builder.CreateCall(ExtractIntr, {ResBinOp, Builder.getInt64(0)}); + I1->replaceAllUsesWith(Res); + } +} + +bool VectorWiden::canWidenNode(ArrayRef IL, + LLVMContext &Context) { + if (!OverrideTargetConsiderToWiden && !TTI.considerToWiden(Context, IL)) + return false; + + bool HasSecondOperand = IL[0]->getNumOperands() > 1; + for (int X = 0, E = IL.size(); X < E; X++) { + for (int Y = 0, E = IL.size(); Y < E; Y++) { + if (X == Y) + continue; + if (IL[X] == IL[Y] || IL[X]->getOperand(0) == IL[Y] || + (HasSecondOperand && IL[X]->getOperand(1) == IL[Y])) + return false; + } + if (isDeleted(IL[X]) || !IL[X]->hasOneUse()) + return false; + if (X == 0) + continue; + if (IL[X]->getOpcode() != IL[X - 1]->getOpcode() || + // Ignore if any types are different. 
+ IL[X]->getType() != IL[X - 1]->getType() || + IL[X]->getOperand(0)->getType() != + IL[X - 1]->getOperand(0)->getType() || + IL[X - 1]->comesBefore(IL[X])) + return false; + if (IL[0]->getParent() == IL[X]->user_back()->getParent() && + IL[X]->user_back()->comesBefore(IL[0])) + return false; + } + return true; +} + +bool VectorWiden::widenNode(ArrayRef IL, LLVMContext &Context) { + // Currently, this pass only supports widening two operations into + // a single operation. + if (IL.size() != 2) + return false; + if (!canWidenNode(IL, Context)) + return false; + + unsigned Opcode = IL[0]->getOpcode(); + + if (dyn_cast(IL[0])) { + if (!OverrideTargetConsiderToWiden) { + auto *OrigType = cast(IL[0]->getOperand(0)->getType()); + auto *RetOrigType = cast(IL[0]->getType()); + InstructionCost Cost = getOpCost(Opcode, RetOrigType, OrigType, IL[0]); + auto *RetType = VectorType::getDoubleElementsVectorType(RetOrigType); + auto *OpType = VectorType::getDoubleElementsVectorType(OrigType); + InstructionCost CostNew = getOpCost(Opcode, RetType, OpType, IL[0]); + if (2 * Cost < CostNew) + return false; + } + LLVM_DEBUG( + dbgs() + << "VW: Decided to widen CastInst, safe to merge node starting with " + << *IL[0] << "\n"); + widenCastInst(IL); + return true; + } + if (dyn_cast(IL[0])) { + if (!OverrideTargetConsiderToWiden) { + auto *OrigType = cast(IL[0]->getOperand(0)->getType()); + auto *OpType = VectorType::getDoubleElementsVectorType(OrigType); + InstructionCost Cost = getOpCost(Opcode, OrigType, OrigType, IL[0]); + InstructionCost CostNew = getOpCost(Opcode, OpType, OpType, IL[0]); + if (2 * Cost < CostNew) + return false; + } + LLVM_DEBUG( + dbgs() + << "VW: Decided to widen BinaryOp, safe to merge node starting with " + << *IL[0] << "\n"); + // We want to propagate IR flags for the group of operations here, such as + // the "fast" flag for floating-point operations or "nuw" for integer ones. + widenBinaryOperator(IL); + return true; + } + return false; +} + +InstructionCost VectorWiden::getOpCost(unsigned Opcode, Type *To, Type *From, + Instruction *I) { + InstructionCost Cost = 0; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + if (dyn_cast(I)) { + unsigned OpIdx = isa(I) ? 0 : 1; + TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(I->getOperand(0)); + TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(I->getOperand(OpIdx)); + SmallVector Operands(I->operand_values()); + Cost = TTI.getArithmeticInstrCost(I->getOpcode(), To, CostKind, Op1Info, + Op2Info, Operands, I); + } else if (dyn_cast(I)) { + Cost = TTI.getCastInstrCost(Opcode, To, From, TTI::getCastContextHint(I), + CostKind, I); + } + return Cost; +} + +static bool isOperationSupported(Instruction *I) { + unsigned Opcode = I->getOpcode(); + // Currently, we support only these operations, but more could be added later. + if (dyn_cast(I->getType()) && + (I->isBinaryOp() || Opcode == Instruction::SExt || + Opcode == Instruction::ZExt || Opcode == Instruction::FPToUI || + Opcode == Instruction::FPToSI || Opcode == Instruction::FPExt || + Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP || + Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc || + Opcode == Instruction::BitCast)) + return true; + return false; +} + +bool VectorWiden::processBB(BasicBlock &BB, LLVMContext &Context) { + struct Operation { + // Position at which the first operation in the list of operations was + // discovered, measured from the last instruction in the current basic block. + unsigned Position; + InstrList Ops; + }; + // The key is the operation opcode.
+ // The value is a list of operations together with the position of the + // first operation in the basic block. + DenseMap Operations; + Instruction *LastInstr = BB.getTerminator(); + unsigned CurrentPosition = 0; + for (BasicBlock::reverse_iterator IP(BB.rbegin()); IP != BB.rend(); + ++IP, ++CurrentPosition) { + Instruction *I = &*IP; + unsigned OpFound = 0; + + if (I->isDebugOrPseudoInst() || isDeleted(I) || !isOperationSupported(I)) + continue; + + unsigned Opcode = I->getOpcode(); + if (Operations.contains(Opcode)) { + Operation *OpRec = &Operations[Opcode]; + // If the instructions are too far apart, then remove the old instructions + // and reset the position to the next instruction remaining in the list. + if (CurrentPosition - OpRec->Position > MaxInstDistance) { + unsigned NumToDelete = 0; + for (InstrList::iterator It = OpRec->Ops.begin(); + It != OpRec->Ops.end(); ++It) { + Instruction *Instr = *It; + unsigned NewPosition = + std::distance(Instr->getIterator(), LastInstr->getIterator()); + if (CurrentPosition - NewPosition > MaxInstDistance) { + NumToDelete++; + } else { + // Update the Position value to the next remaining in-range operation. + OpRec->Position = NewPosition; + LLVM_DEBUG(dbgs() << "VW: Updating node starting with " + << **(OpRec->Ops.begin()) + << " position to : " << NewPosition << "\n"); + break; + } + } + for (unsigned i = 0; i < NumToDelete; ++i) { + LLVM_DEBUG(dbgs() + << "VW: Deleting operation " << **(OpRec->Ops.begin()) + << " from node as out of range." + << "\n"); + OpRec->Ops.erase(OpRec->Ops.begin()); + } + } + // If no operations are left in the list, set the position to the current one. + if (!OpRec->Ops.size()) + OpRec->Position = CurrentPosition; + OpRec->Ops.push_back(I); + LLVM_DEBUG(dbgs() << "VW: Found operation " << *I + << " to add to existing node starting at " + << **(OpRec->Ops.begin()) << " at : " << OpRec->Position + << "\n"); + if (OpRec->Ops.size() > 1) + OpFound = Opcode; + } else { + LLVM_DEBUG(dbgs() << "VW: Found operation " << *I + << " to form a node at : " << CurrentPosition << "\n"); + Operations[Opcode] = {CurrentPosition, {I}}; + } + + if (OpFound && Operations.contains(OpFound)) { + auto *OpRec = &Operations[OpFound]; + for (Instruction *Op : OpRec->Ops) + LLVM_DEBUG(dbgs() << "VW: operation to check : " << *Op << "\n"); + if (!widenNode(OpRec->Ops, Context)) { + LLVM_DEBUG(dbgs() << "VW: Unable to use a wider vector for vector ops.\n"); + if (OpRec->Ops.size() > 4) { + LLVM_DEBUG(dbgs() << "VW: Deleting operation " + << **(OpRec->Ops.begin()) << " as unable to widen." + << "\n"); + OpRec->Ops.erase(OpRec->Ops.begin()); + OpRec->Position = std::distance( + (*(OpRec->Ops.begin()))->getIterator(), LastInstr->getIterator()); + } + } else { + for (Instruction *Instr : OpRec->Ops) + eraseInstruction(Instr); + return true; + } + } + } + return false; +} + +bool VectorWiden::run() { + bool Changed = false; + LLVMContext &Context = F.getContext(); + + LLVM_DEBUG(dbgs() << "VW: Function:" << F.getName() << "\n"); + for (BasicBlock &BB : F) { + LLVM_DEBUG(dbgs() << "VW: BB:" << BB.getName() << "\n"); + + // If any transformation is done, then we have to start all over again, + // since we generate new instructions.
+ while (processBB(BB, Context)) + Changed = true; + } + + if (Changed) + for (auto *I : DeletedInstructions) + RecursivelyDeleteTriviallyDeadInstructions(I); + + return Changed; +} + +PreservedAnalyses VectorWidenPass::run(Function &F, + FunctionAnalysisManager &FAM) { + TargetTransformInfo &TTI = FAM.getResult(F); + + VectorWiden VecWiden(F, TTI); + + if (!VecWiden.run()) + return PreservedAnalyses::all(); + + PreservedAnalyses PA; + PA.preserveSet(); + return PA; +} diff --git a/llvm/test/Transforms/VectorWiden/add.ll b/llvm/test/Transforms/VectorWiden/add.ll new file mode 100644 index 0000000000000..05b2eeeb5a9c6 --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/add.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S %s 2>&1 | FileCheck %s + +define void @add( %a, %b, %c, ptr %ptr) { +; CHECK-LABEL: define void @add( +; CHECK-SAME: [[A:%.*]], [[B:%.*]], [[C:%.*]], ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( undef, [[A]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP0]], [[B]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( undef, [[C]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[C]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = add [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP4]], i64 0) +; CHECK-NEXT: store [[TMP6]], ptr [[PTR]], align 16 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds , ptr [[PTR]], i64 1 +; CHECK-NEXT: store [[TMP5]], ptr [[INCDEC_PTR3]], align 16 +; CHECK-NEXT: ret void +; +entry: + %add = add %a, %c + %add4 = add %b, %c + store %add, ptr %ptr, align 16 + %incdec.ptr3 = getelementptr inbounds , ptr %ptr, i64 1 + store %add4, ptr %incdec.ptr3, align 16 + ret void +} + +define void @add_ir_flags( %a, %b, %c, ptr %ptr) { +; CHECK-LABEL: define void @add_ir_flags( +; CHECK-SAME: [[A:%.*]], [[B:%.*]], [[C:%.*]], ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( undef, [[A]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP0]], [[B]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( undef, [[C]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[C]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4i32.nxv8i32( [[TMP4]], i64 0) +; CHECK-NEXT: store [[TMP6]], ptr [[PTR]], align 16 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds , ptr [[PTR]], i64 1 +; CHECK-NEXT: store [[TMP5]], ptr [[INCDEC_PTR3]], align 16 +; CHECK-NEXT: ret void +; +entry: + %add = add nuw nsw %a, %c + %add4 = add nuw %b, %c + store %add, ptr %ptr, align 16 + %incdec.ptr3 = getelementptr inbounds , ptr %ptr, i64 1 + store %add4, ptr %incdec.ptr3, align 16 + ret void +} diff --git a/llvm/test/Transforms/VectorWiden/bad-dependace.ll b/llvm/test/Transforms/VectorWiden/bad-dependace.ll new file mode 100644 index 0000000000000..9e901037dcd2f --- /dev/null +++ 
b/llvm/test/Transforms/VectorWiden/bad-dependace.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S 2>&1 | FileCheck %s + +define void @fptrunc( %a, %b, ptr %ptr) { +; CHECK-LABEL: @fptrunc( +; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = fptrunc [[A:%.*]] to +; CHECK-NEXT: [[EXTR:%.*]] = call @llvm.vector.extract.nxv1f16.nxv4f16( [[TMP3]], i64 0) +; CHECK-NEXT: [[EXTEND:%.*]] = fpext [[EXTR]] to +; CHECK-NEXT: [[INS:%.*]] = call @llvm.vector.insert.nxv4f32.nxv1f32( [[B:%.*]], [[EXTEND]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = fptrunc [[INS]] to +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds half, ptr [[PTR:%.*]], i64 0 +; CHECK-NEXT: store [[TMP3]], ptr [[TMP5]], align 2 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds half, ptr [[TMP5]], i64 [[TMP2]] +; CHECK-NEXT: store [[TMP4]], ptr [[TMP6]], align 2 +; CHECK-NEXT: ret void +; + %1 = tail call i64 @llvm.vscale.i64() + %2 = shl nuw nsw i64 %1, 2 + %3 = fptrunc %a to + %extr = call @llvm.vector.extract.nxv1f16.nxv4f16( %3, i64 0) + %extend = fpext %extr to + %ins = call @llvm.vector.insert.nxv4f32.nxv1f32( %b, %extend, i64 0) + %4 = fptrunc %ins to + %5 = getelementptr inbounds half, ptr %ptr, i64 0 + store %3, ptr %5, align 2 + %6 = getelementptr inbounds half, ptr %5, i64 %2 + store %4, ptr %6, align 2 + ret void +} + +define void @add( %a, %b, ptr %ptr) { +; CHECK-LABEL: @add( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[ADD:%.*]] = add [[A:%.*]], [[B:%.*]] +; CHECK-NEXT: [[ADD4:%.*]] = add [[ADD]], [[B]] +; CHECK-NEXT: store [[ADD]], ptr [[PTR:%.*]], align 16 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds , ptr [[PTR]], i64 1 +; CHECK-NEXT: store [[ADD4]], ptr [[INCDEC_PTR3]], align 16 +; CHECK-NEXT: ret void +; +entry: + %add = add %a, %b + %add4 = add %add, %b + store %add, ptr %ptr, align 16 + %incdec.ptr3 = getelementptr inbounds , ptr %ptr, i64 1 + store %add4, ptr %incdec.ptr3, align 16 + ret void +} + +declare i64 @llvm.vscale.i64() +declare @llvm.vector.extract.nxv1f16.nxv4f16(, i64 immarg) +declare @llvm.vector.insert.nxv4f32.nxv1f32(, , i64 immarg) diff --git a/llvm/test/Transforms/VectorWiden/bitcast.ll b/llvm/test/Transforms/VectorWiden/bitcast.ll new file mode 100644 index 0000000000000..c40780653e981 --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/bitcast.ll @@ -0,0 +1,66 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s + + +define void @bitcast1(<8 x i64> %a, <8 x i64> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @bitcast1( +; CHECK-SAME: <8 x i64> [[A:%.*]], <8 x i64> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> undef, <8 x i64> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> [[TMP0]], <8 x i64> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i64> [[TMP1]] to <32 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.v32i32(<32 x i32> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i32> @llvm.vector.extract.v16i32.v32i32(<32 x i32> [[TMP2]], i64 16) +; CHECK-NEXT: store <16 x i32> [[TMP4]], ptr [[PTR]], 
align 16 +; CHECK-NEXT: store <16 x i32> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast <8 x i64> %a to <16 x i32> + %1 = bitcast <8 x i64> %b to <16 x i32> + store <16 x i32> %0, ptr %ptr, align 16 + store <16 x i32> %1, ptr %ptr1, align 16 + ret void +} + +define void @bitcast2(<4 x i64> %a, <4 x i64> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @bitcast2( +; CHECK-SAME: <4 x i64> [[A:%.*]], <4 x i64> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> undef, <4 x i64> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i64> @llvm.vector.insert.v8i64.v4i64(<8 x i64> [[TMP0]], <4 x i64> [[A]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <8 x i64> [[TMP1]] to <64 x i8> +; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.vector.extract.v32i8.v64i8(<64 x i8> [[TMP2]], i64 32) +; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <32 x i8> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast <4 x i64> %a to <32 x i8> + %1 = bitcast <4 x i64> %b to <32 x i8> + store <32 x i8> %0, ptr %ptr, align 16 + store <32 x i8> %1, ptr %ptr1, align 16 + ret void +} + +define void @bitcast3(<32 x i8> %a, <32 x i8> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @bitcast3( +; CHECK-SAME: <32 x i8> [[A:%.*]], <32 x i8> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <64 x i8> @llvm.vector.insert.v64i8.v32i8(<64 x i8> undef, <32 x i8> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <64 x i8> @llvm.vector.insert.v64i8.v32i8(<64 x i8> [[TMP0]], <32 x i8> [[A]], i64 32) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast <64 x i8> [[TMP1]] to <8 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i64> @llvm.vector.extract.v4i64.v8i64(<8 x i64> [[TMP2]], i64 4) +; CHECK-NEXT: store <4 x i64> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = bitcast <32 x i8> %a to <4 x i64> + %1 = bitcast <32 x i8> %b to <4 x i64> + store <4 x i64> %0, ptr %ptr, align 16 + store <4 x i64> %1, ptr %ptr1, align 16 + ret void +} diff --git a/llvm/test/Transforms/VectorWiden/ext-trunc.ll b/llvm/test/Transforms/VectorWiden/ext-trunc.ll new file mode 100644 index 0000000000000..27db8440a499c --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/ext-trunc.ll @@ -0,0 +1,67 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s + + +define void @sext(<8 x i8> %a, <8 x i8> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @sext( +; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> undef, <8 x i8> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[TMP1]] to <16 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 0) +; 
CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = sext <8 x i8> %a to <8 x i64> + %1 = sext <8 x i8> %b to <8 x i64> + store <8 x i64> %0, ptr %ptr, align 16 + store <8 x i64> %1, ptr %ptr1, align 16 + ret void +} + +define void @zext(<8 x i8> %a, <8 x i8> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @zext( +; CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> undef, <8 x i8> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP0]], <8 x i8> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[TMP1]] to <16 x i64> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i64> @llvm.vector.extract.v8i64.v16i64(<16 x i64> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x i64> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x i64> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = zext <8 x i8> %a to <8 x i64> + %1 = zext <8 x i8> %b to <8 x i64> + store <8 x i64> %0, ptr %ptr, align 16 + store <8 x i64> %1, ptr %ptr1, align 16 + ret void +} + +define void @trunc(<8 x i64> %a, <8 x i64> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @trunc( +; CHECK-SAME: <8 x i64> [[A:%.*]], <8 x i64> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> undef, <8 x i64> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i64> @llvm.vector.insert.v16i64.v8i64(<16 x i64> [[TMP0]], <8 x i64> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[TMP1]] to <16 x i8> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x i8> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x i8> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = trunc <8 x i64> %a to <8 x i8> + %1 = trunc <8 x i64> %b to <8 x i8> + store <8 x i8> %0, ptr %ptr, align 16 + store <8 x i8> %1, ptr %ptr1, align 16 + ret void +} + diff --git a/llvm/test/Transforms/VectorWiden/fadd.ll b/llvm/test/Transforms/VectorWiden/fadd.ll new file mode 100644 index 0000000000000..01070ba824c3e --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/fadd.ll @@ -0,0 +1,52 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s + +define void @add( %a, %b, %c, ptr %ptr) { +; CHECK-LABEL: define void @add( +; CHECK-SAME: [[A:%.*]], [[B:%.*]], [[C:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( undef, [[A]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP0]], [[B]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( undef, [[C]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[C]], i64 
4) +; CHECK-NEXT: [[TMP4:%.*]] = fadd [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4f32.nxv8f32( [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4f32.nxv8f32( [[TMP4]], i64 0) +; CHECK-NEXT: store [[TMP6]], ptr [[PTR]], align 16 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds , ptr [[PTR]], i64 1 +; CHECK-NEXT: store [[TMP5]], ptr [[INCDEC_PTR3]], align 16 +; CHECK-NEXT: ret void +; +entry: + %add = fadd %a, %c + %add4 = fadd %b, %c + store %add, ptr %ptr, align 16 + %incdec.ptr3 = getelementptr inbounds , ptr %ptr, i64 1 + store %add4, ptr %incdec.ptr3, align 16 + ret void +} + +define void @add_ir_flags( %a, %b, %c, ptr %ptr) { +; CHECK-LABEL: define void @add_ir_flags( +; CHECK-SAME: [[A:%.*]], [[B:%.*]], [[C:%.*]], ptr [[PTR:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( undef, [[A]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP0]], [[B]], i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( undef, [[C]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[C]], i64 4) +; CHECK-NEXT: [[TMP4:%.*]] = fadd nnan ninf [[TMP1]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv4f32.nxv8f32( [[TMP4]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4f32.nxv8f32( [[TMP4]], i64 0) +; CHECK-NEXT: store [[TMP6]], ptr [[PTR]], align 16 +; CHECK-NEXT: [[INCDEC_PTR3:%.*]] = getelementptr inbounds , ptr [[PTR]], i64 1 +; CHECK-NEXT: store [[TMP5]], ptr [[INCDEC_PTR3]], align 16 +; CHECK-NEXT: ret void +; +entry: + %add = fadd fast nnan %a, %c + %add4 = fadd nnan ninf %b, %c + store %add, ptr %ptr, align 16 + %incdec.ptr3 = getelementptr inbounds , ptr %ptr, i64 1 + store %add4, ptr %incdec.ptr3, align 16 + ret void +} diff --git a/llvm/test/Transforms/VectorWiden/fp-ext.ll b/llvm/test/Transforms/VectorWiden/fp-ext.ll new file mode 100644 index 0000000000000..a4fadd8dbaabf --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/fp-ext.ll @@ -0,0 +1,24 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s + + +define void @fp_ext(<8 x half> %a, <8 x half> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @fp_ext( +; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> undef, <8 x half> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> [[TMP0]], <8 x half> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = fpext <16 x half> [[TMP1]] to <16 x double> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x double> @llvm.vector.extract.v8f64.v16f64(<16 x double> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x double> @llvm.vector.extract.v8f64.v16f64(<16 x double> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x double> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x double> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = fpext <8 x half> %a to <8 x double> + %1 = fpext <8 x half> %b to <8 x double> + store <8 x double> %0, ptr %ptr, align 16 + store <8 x double> %1, ptr %ptr1, align 16 + ret void +} diff --git a/llvm/test/Transforms/VectorWiden/fp-int.ll 
b/llvm/test/Transforms/VectorWiden/fp-int.ll new file mode 100644 index 0000000000000..1e1ebd2939b36 --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/fp-int.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 +; RUN: opt -passes=vector-widen -vw-override-target-consider-to-widen=1 -S %s 2>&1 | FileCheck %s + +define void @fptosi(<8 x half> %a, <8 x half> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @fptosi( +; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> undef, <8 x half> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> [[TMP0]], <8 x half> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <16 x half> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = fptosi <8 x half> %a to <8 x i16> + %1 = fptosi <8 x half> %b to <8 x i16> + store <8 x i16> %0, ptr %ptr, align 16 + store <8 x i16> %1, ptr %ptr1, align 16 + ret void +} + +define void @sitofp(<8 x i16> %a, <8 x i16> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @sitofp( +; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> undef, <8 x i16> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP0]], <8 x i16> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = sitofp <16 x i16> [[TMP1]] to <16 x half> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x half> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x half> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = sitofp <8 x i16> %a to <8 x half> + %1 = sitofp <8 x i16> %b to <8 x half> + store <8 x half> %0, ptr %ptr, align 16 + store <8 x half> %1, ptr %ptr1, align 16 + ret void +} + +define void @fptoui(<8 x half> %a, <8 x half> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @fptoui( +; CHECK-SAME: <8 x half> [[A:%.*]], <8 x half> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> undef, <8 x half> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x half> @llvm.vector.insert.v16f16.v8f16(<16 x half> [[TMP0]], <8 x half> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = fptoui <16 x half> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x i16> @llvm.vector.extract.v8i16.v16i16(<16 x i16> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x i16> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x i16> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = fptoui <8 x half> %a to 
<8 x i16> + %1 = fptoui <8 x half> %b to <8 x i16> + store <8 x i16> %0, ptr %ptr, align 16 + store <8 x i16> %1, ptr %ptr1, align 16 + ret void +} + +define void @uitofp(<8 x i16> %a, <8 x i16> %b, ptr %ptr, ptr %ptr1) { +; CHECK-LABEL: define void @uitofp( +; CHECK-SAME: <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], ptr [[PTR:%.*]], ptr [[PTR1:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> undef, <8 x i16> [[B]], i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP0]], <8 x i16> [[A]], i64 8) +; CHECK-NEXT: [[TMP2:%.*]] = uitofp <16 x i16> [[TMP1]] to <16 x half> +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x half> @llvm.vector.extract.v8f16.v16f16(<16 x half> [[TMP2]], i64 8) +; CHECK-NEXT: store <8 x half> [[TMP4]], ptr [[PTR]], align 16 +; CHECK-NEXT: store <8 x half> [[TMP3]], ptr [[PTR1]], align 16 +; CHECK-NEXT: ret void +; +entry: + %0 = uitofp <8 x i16> %a to <8 x half> + %1 = uitofp <8 x i16> %b to <8 x half> + store <8 x half> %0, ptr %ptr, align 16 + store <8 x half> %1, ptr %ptr1, align 16 + ret void +} diff --git a/llvm/test/Transforms/VectorWiden/fptrunc.ll b/llvm/test/Transforms/VectorWiden/fptrunc.ll new file mode 100644 index 0000000000000..4c19abf852ead --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/fptrunc.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=vector-widen -mtriple aarch64-linux-gnu -mattr=+sme2 -S 2>&1 | FileCheck %s + +define void @fptrunc( %a, %b, ptr %ptr) { +; CHECK-LABEL: @fptrunc( +; CHECK-NEXT: [[TMP1:%.*]] = tail call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( undef, [[B:%.*]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP3]], [[A:%.*]], i64 4) +; CHECK-NEXT: [[TMP5:%.*]] = fptrunc [[TMP4]] to +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv4f16.nxv8f16( [[TMP5]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv4f16.nxv8f16( [[TMP5]], i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds half, ptr [[PTR:%.*]], i64 0 +; CHECK-NEXT: store [[TMP7]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds half, ptr [[TMP8]], i64 [[TMP2]] +; CHECK-NEXT: store [[TMP6]], ptr [[TMP9]], align 2 +; CHECK-NEXT: ret void +; + %1 = tail call i64 @llvm.vscale.i64() + %2 = shl nuw nsw i64 %1, 2 + %3 = fptrunc %a to + %4 = fptrunc %b to + %5 = getelementptr inbounds half, ptr %ptr, i64 0 + store %3, ptr %5, align 2 + %6 = getelementptr inbounds half, ptr %5, i64 %2 + store %4, ptr %6, align 2 + ret void +} + +declare i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/VectorWiden/widen-distance.ll b/llvm/test/Transforms/VectorWiden/widen-distance.ll new file mode 100644 index 0000000000000..bc6d0682f8475 --- /dev/null +++ b/llvm/test/Transforms/VectorWiden/widen-distance.ll @@ -0,0 +1,59 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=vector-widen -S -vw-override-target-consider-to-widen=1 -vw-max-instr-distance=2 < %s | FileCheck %s + +define <4 x i32> @foo(float %a0, float %a1, float %a2, float %a3, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: @foo( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTR1:%.*]], align 
16 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[PTR1]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> undef, <4 x float> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP3]], <4 x float> [[TMP1]], i64 4) +; CHECK-NEXT: [[TMP5:%.*]] = fptosi <8 x float> [[TMP4]] to <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP5]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP5]], i64 4) +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> poison, float [[A0:%.*]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[A1:%.*]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[A2:%.*]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x float> [[TMP10]], float [[A3:%.*]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = fptosi <4 x float> [[TMP11]] to <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[PTR1]], align 2 +; CHECK-NEXT: store <4 x i32> [[TMP12]], ptr [[PTR2:%.*]], align 2 +; CHECK-NEXT: ret <4 x i32> [[TMP7]] +; + %1 = load <4 x float>, ptr %ptr1 + %2 = fptosi <4 x float> %1 to <4 x i32> + %3 = load <4 x float>, ptr %ptr1 + %4 = fptosi <4 x float> %3 to <4 x i32> + %5 = insertelement <4 x float> poison, float %a0, i32 0 + %6 = insertelement <4 x float> %5, float %a1, i32 1 + %7 = insertelement <4 x float> %6, float %a2, i32 2 + %8 = insertelement <4 x float> %7, float %a3, i32 3 + %9 = fptosi <4 x float> %8 to <4 x i32> + store <4 x i32> %4, ptr %ptr1, align 2 + store <4 x i32> %9, ptr %ptr2, align 2 + ret <4 x i32> %2 +} + +define <4 x i32> @bar(<4 x float> %a0, ptr %ptr1, ptr %ptr2) { +; CHECK-LABEL: @bar( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[PTR1:%.*]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = fptosi <4 x float> [[TMP1]] to <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x float>, ptr [[PTR1]], align 16 +; CHECK-NEXT: [[TMP4:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> undef, <4 x float> [[A0:%.*]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP4]], <4 x float> [[TMP3]], i64 4) +; CHECK-NEXT: [[TMP6:%.*]] = fptosi <8 x float> [[TMP5]] to <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP8:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.v8i32(<8 x i32> [[TMP6]], i64 4) +; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[PTR1]], align 2 +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[PTR2:%.*]], align 2 +; CHECK-NEXT: ret <4 x i32> [[TMP2]] +; + + %1 = load <4 x float>, ptr %ptr1 + %2 = fptosi <4 x float> %1 to <4 x i32> + %3 = load <4 x float>, ptr %ptr1 + %4 = fptosi <4 x float> %3 to <4 x i32> + %5 = fptosi <4 x float> %a0 to <4 x i32> + store <4 x i32> %4, ptr %ptr1, align 2 + store <4 x i32> %5, ptr %ptr2, align 2 + ret <4 x i32> %2 +} diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index ca67426e08699..f5ef7bbd7106a 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -19,5 +19,6 @@ static_library("Vectorize") { "VPlanVerifier.cpp", "VectorCombine.cpp", "Vectorize.cpp", + "VectorWiden.cpp", ] }