diff --git a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h index 80c3f187be8cd..5524b55b81b58 100644 --- a/llvm/include/llvm/Transforms/Scalar/Scalarizer.h +++ b/llvm/include/llvm/Transforms/Scalar/Scalarizer.h @@ -7,8 +7,9 @@ //===----------------------------------------------------------------------===// // /// \file -/// This pass converts vector operations into scalar operations, in order -/// to expose optimization opportunities on the individual scalar operations. +/// This pass converts vector operations into scalar operations (or, optionally, +/// operations on smaller vector widths), in order to expose optimization +/// opportunities on the individual scalar operations. /// It is mainly intended for targets that do not have vector units, but it /// may also be useful for revectorizing code to different vector widths. // @@ -26,24 +27,29 @@ class Function; class FunctionPass; struct ScalarizerPassOptions { - // These optional booleans correspond 1:1 to cl::opt options defined in + // These options correspond 1:1 to cl::opt options defined in // Scalarizer.cpp. When the cl::opt are specified, they take precedence. - // When the cl::opt are not specified, the present optional booleans allow to + // When the cl::opt are not specified, the present optional values allow to // override the cl::opt's default values. 
std::optional ScalarizeVariableInsertExtract; std::optional ScalarizeLoadStore; + std::optional ScalarizeMinBits; }; class ScalarizerPass : public PassInfoMixin { ScalarizerPassOptions Options; public: + ScalarizerPass() = default; + ScalarizerPass(const ScalarizerPassOptions &Options) : Options(Options) {} + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); void setScalarizeVariableInsertExtract(bool Value) { Options.ScalarizeVariableInsertExtract = Value; } void setScalarizeLoadStore(bool Value) { Options.ScalarizeLoadStore = Value; } + void setScalarizeMinBits(unsigned Value) { Options.ScalarizeMinBits = Value; } }; /// Create a legacy pass manager instance of the Scalarizer pass diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp index f321bea17d7d6..6190392a7e1b4 100644 --- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp +++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp @@ -6,8 +6,9 @@ // //===----------------------------------------------------------------------===// // -// This pass converts vector operations into scalar operations, in order -// to expose optimization opportunities on the individual scalar operations. +// This pass converts vector operations into scalar operations (or, optionally, +// operations on smaller vector widths), in order to expose optimization +// opportunities on the individual scalar operations. // It is mainly intended for targets that do not have vector units, but it // may also be useful for revectorizing code to different vector widths. // @@ -62,6 +63,16 @@ static cl::opt ClScalarizeLoadStore( "scalarize-load-store", cl::init(false), cl::Hidden, cl::desc("Allow the scalarizer pass to scalarize loads and store")); +// Split vectors larger than this size into fragments, where each fragment is +// either a vector no larger than this size or a scalar. 
+// +// Instructions with operands or results of different sizes that would be split +// into a different number of fragments are currently left as-is. +static cl::opt ClScalarizeMinBits( + "scalarize-min-bits", cl::init(0), cl::Hidden, + cl::desc("Instruct the scalarizer pass to attempt to keep values of a " + "minimum number of bits")); + namespace { BasicBlock::iterator skipPastPhiNodesAndDbg(BasicBlock::iterator Itr) { @@ -88,6 +99,29 @@ using ScatterMap = std::map, ValueVector>; // along with a pointer to their scattered forms. using GatherList = SmallVector, 16>; +struct VectorSplit { + // The type of the vector. + FixedVectorType *VecTy = nullptr; + + // The number of elements packed in a fragment (other than the remainder). + unsigned NumPacked = 0; + + // The number of fragments (scalars or smaller vectors) into which the vector + // shall be split. + unsigned NumFragments = 0; + + // The type of each complete fragment. + Type *SplitTy = nullptr; + + // The type of the remainder (last) fragment; null if all fragments are + // complete. + Type *RemainderTy = nullptr; + + Type *getFragmentType(unsigned I) const { + return RemainderTy && I == NumFragments - 1 ? RemainderTy : SplitTy; + } +}; + // Provides a very limited vector-like interface for lazily accessing one // component of a scattered vector or vector pointer. class Scatterer { @@ -97,23 +131,23 @@ class Scatterer { // Scatter V into Size components. If new instructions are needed, // insert them before BBI in BB. If Cache is nonnull, use it to cache // the results. - Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, Type *PtrElemTy, - ValueVector *cachePtr = nullptr); + Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, + const VectorSplit &VS, ValueVector *cachePtr = nullptr); // Return component I, creating a new Value for it if necessary. Value *operator[](unsigned I); // Return the number of components. 
- unsigned size() const { return Size; } + unsigned size() const { return VS.NumFragments; } private: BasicBlock *BB; BasicBlock::iterator BBI; Value *V; - Type *PtrElemTy; + VectorSplit VS; + bool IsPointer; ValueVector *CachePtr; ValueVector Tmp; - unsigned Size; }; // FCmpSplitter(FCI)(Builder, X, Y, Name) uses Builder to create an FCmp @@ -171,24 +205,74 @@ struct BinarySplitter { struct VectorLayout { VectorLayout() = default; - // Return the alignment of element I. - Align getElemAlign(unsigned I) { - return commonAlignment(VecAlign, I * ElemSize); + // Return the alignment of fragment Frag. + Align getFragmentAlign(unsigned Frag) { + return commonAlignment(VecAlign, Frag * SplitSize); } - // The type of the vector. - FixedVectorType *VecTy = nullptr; - - // The type of each element. - Type *ElemTy = nullptr; + // The split of the underlying vector type. + VectorSplit VS; // The alignment of the vector. Align VecAlign; - // The size of each element. - uint64_t ElemSize = 0; + // The size of each (non-remainder) fragment in bytes. + uint64_t SplitSize = 0; }; +/// Concatenate the given fragments to a single vector value of the type +/// described in @p VS. +static Value *concatenate(IRBuilder<> &Builder, ArrayRef Fragments, + const VectorSplit &VS, Twine Name) { + unsigned NumElements = VS.VecTy->getNumElements(); + SmallVector ExtendMask; + SmallVector InsertMask; + + if (VS.NumPacked > 1) { + // Prepare the shufflevector masks once and re-use them for all + // fragments. 
+ ExtendMask.resize(NumElements, -1); + for (unsigned I = 0; I < VS.NumPacked; ++I) + ExtendMask[I] = I; + + InsertMask.resize(NumElements); + for (unsigned I = 0; I < NumElements; ++I) + InsertMask[I] = I; + } + + Value *Res = PoisonValue::get(VS.VecTy); + for (unsigned I = 0; I < VS.NumFragments; ++I) { + Value *Fragment = Fragments[I]; + + unsigned NumPacked = VS.NumPacked; + if (I == VS.NumFragments - 1 && VS.RemainderTy) { + if (auto *RemVecTy = dyn_cast(VS.RemainderTy)) + NumPacked = RemVecTy->getNumElements(); + else + NumPacked = 1; + } + + if (NumPacked == 1) { + Res = Builder.CreateInsertElement(Res, Fragment, I * VS.NumPacked, + Name + ".upto" + Twine(I)); + } else { + Fragment = Builder.CreateShuffleVector(Fragment, Fragment, ExtendMask); + if (I == 0) { + Res = Fragment; + } else { + for (unsigned J = 0; J < NumPacked; ++J) + InsertMask[I * VS.NumPacked + J] = NumElements + J; + Res = Builder.CreateShuffleVector(Res, Fragment, InsertMask, + Name + ".upto" + Twine(I)); + for (unsigned J = 0; J < NumPacked; ++J) + InsertMask[I * VS.NumPacked + J] = I * VS.NumPacked + J; + } + } + } + + return Res; +} + template T getWithDefaultOverride(const cl::opt &ClOption, const std::optional &DefaultOverride) { @@ -205,8 +289,9 @@ class ScalarizerVisitor : public InstVisitor { getWithDefaultOverride(ClScalarizeVariableInsertExtract, Options.ScalarizeVariableInsertExtract)), ScalarizeLoadStore(getWithDefaultOverride(ClScalarizeLoadStore, - Options.ScalarizeLoadStore)) { - } + Options.ScalarizeLoadStore)), + ScalarizeMinBits(getWithDefaultOverride(ClScalarizeMinBits, + Options.ScalarizeMinBits)) {} bool visit(Function &F); @@ -231,11 +316,12 @@ class ScalarizerVisitor : public InstVisitor { bool visitFreezeInst(FreezeInst &FI); private: - Scatterer scatter(Instruction *Point, Value *V, Type *PtrElemTy = nullptr); - void gather(Instruction *Op, const ValueVector &CV); + Scatterer scatter(Instruction *Point, Value *V, const VectorSplit &VS); + void gather(Instruction 
*Op, const ValueVector &CV, const VectorSplit &VS); void replaceUses(Instruction *Op, Value *CV); bool canTransferMetadata(unsigned Kind); void transferMetadataAndIRFlags(Instruction *Op, const ValueVector &CV); + std::optional getVectorSplit(Type *Ty); std::optional getVectorLayout(Type *Ty, Align Alignment, const DataLayout &DL); bool finish(); @@ -257,6 +343,7 @@ class ScalarizerVisitor : public InstVisitor { const bool ScalarizeVariableInsertExtract; const bool ScalarizeLoadStore; + const unsigned ScalarizeMinBits; }; class ScalarizerLegacyPass : public FunctionPass { @@ -285,42 +372,54 @@ INITIALIZE_PASS_END(ScalarizerLegacyPass, "scalarizer", "Scalarize vector operations", false, false) Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v, - Type *PtrElemTy, ValueVector *cachePtr) - : BB(bb), BBI(bbi), V(v), PtrElemTy(PtrElemTy), CachePtr(cachePtr) { + const VectorSplit &VS, ValueVector *cachePtr) + : BB(bb), BBI(bbi), V(v), VS(VS), CachePtr(cachePtr) { Type *Ty = V->getType(); if (Ty->isPointerTy()) { - assert(cast(Ty)->isOpaqueOrPointeeTypeMatches(PtrElemTy) && + assert(cast(Ty)->isOpaqueOrPointeeTypeMatches(VS.VecTy) && "Pointer element type mismatch"); - Ty = PtrElemTy; + IsPointer = true; + } else { + IsPointer = false; + } + if (!CachePtr) { + Tmp.resize(VS.NumFragments, nullptr); + } else { + assert((CachePtr->empty() || VS.NumFragments == CachePtr->size() || + IsPointer) && + "Inconsistent vector sizes"); + if (VS.NumFragments > CachePtr->size()) + CachePtr->resize(VS.NumFragments, nullptr); } - Size = cast(Ty)->getNumElements(); - if (!CachePtr) - Tmp.resize(Size, nullptr); - else if (CachePtr->empty()) - CachePtr->resize(Size, nullptr); - else - assert(Size == CachePtr->size() && "Inconsistent vector sizes"); } -// Return component I, creating a new Value for it if necessary. -Value *Scatterer::operator[](unsigned I) { - ValueVector &CV = (CachePtr ? 
*CachePtr : Tmp); +// Return fragment Frag, creating a new Value for it if necessary. +Value *Scatterer::operator[](unsigned Frag) { + ValueVector &CV = CachePtr ? *CachePtr : Tmp; // Try to reuse a previous value. - if (CV[I]) - return CV[I]; + if (CV[Frag]) + return CV[Frag]; IRBuilder<> Builder(BB, BBI); - if (PtrElemTy) { - Type *VectorElemTy = cast(PtrElemTy)->getElementType(); - if (!CV[0]) { - Type *NewPtrTy = PointerType::get( - VectorElemTy, V->getType()->getPointerAddressSpace()); - CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0"); - } - if (I != 0) - CV[I] = Builder.CreateConstGEP1_32(VectorElemTy, CV[0], I, - V->getName() + ".i" + Twine(I)); + if (IsPointer) { + if (Frag == 0) + CV[Frag] = V; + else + CV[Frag] = Builder.CreateConstGEP1_32(VS.SplitTy, V, Frag, + V->getName() + ".i" + Twine(Frag)); + return CV[Frag]; + } + + Type *FragmentTy = VS.getFragmentType(Frag); + + if (auto *VecTy = dyn_cast(FragmentTy)) { + SmallVector Mask; + for (unsigned J = 0; J < VecTy->getNumElements(); ++J) + Mask.push_back(Frag * VS.NumPacked + J); + CV[Frag] = + Builder.CreateShuffleVector(V, PoisonValue::get(V->getType()), Mask, + V->getName() + ".i" + Twine(Frag)); } else { - // Search through a chain of InsertElementInsts looking for element I. + // Search through a chain of InsertElementInsts looking for element Frag. // Record other elements in the cache. The new V is still suitable // for all uncached indices. while (true) { @@ -332,19 +431,23 @@ Value *Scatterer::operator[](unsigned I) { break; unsigned J = Idx->getZExtValue(); V = Insert->getOperand(0); - if (I == J) { - CV[J] = Insert->getOperand(1); - return CV[J]; - } else if (!CV[J]) { + if (Frag * VS.NumPacked == J) { + CV[Frag] = Insert->getOperand(1); + return CV[Frag]; + } + + if (VS.NumPacked == 1 && !CV[J]) { // Only cache the first entry we find for each index we're not actively // searching for. This prevents us from going too far up the chain and // caching incorrect entries. 
CV[J] = Insert->getOperand(1); } } - CV[I] = Builder.CreateExtractElement(V, I, V->getName() + ".i" + Twine(I)); + CV[Frag] = Builder.CreateExtractElement(V, Frag * VS.NumPacked, + V->getName() + ".i" + Twine(Frag)); } - return CV[I]; + + return CV[Frag]; } bool ScalarizerLegacyPass::runOnFunction(Function &F) { @@ -386,13 +489,13 @@ bool ScalarizerVisitor::visit(Function &F) { // Return a scattered form of V that can be accessed by Point. V must be a // vector or a pointer to a vector. Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V, - Type *PtrElemTy) { + const VectorSplit &VS) { if (Argument *VArg = dyn_cast(V)) { // Put the scattered form of arguments in the entry block, // so that it can be used everywhere. Function *F = VArg->getParent(); BasicBlock *BB = &F->getEntryBlock(); - return Scatterer(BB, BB->begin(), V, PtrElemTy, &Scattered[{V, PtrElemTy}]); + return Scatterer(BB, BB->begin(), V, VS, &Scattered[{V, VS.SplitTy}]); } if (Instruction *VOp = dyn_cast(V)) { // When scalarizing PHI nodes we might try to examine/rewrite InsertElement @@ -403,29 +506,30 @@ Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V, // need to analyse them further. if (!DT->isReachableFromEntry(VOp->getParent())) return Scatterer(Point->getParent(), Point->getIterator(), - PoisonValue::get(V->getType()), PtrElemTy); + PoisonValue::get(V->getType()), VS); // Put the scattered form of an instruction directly after the // instruction, skipping over PHI nodes and debug intrinsics. BasicBlock *BB = VOp->getParent(); return Scatterer( - BB, skipPastPhiNodesAndDbg(std::next(BasicBlock::iterator(VOp))), V, - PtrElemTy, &Scattered[{V, PtrElemTy}]); + BB, skipPastPhiNodesAndDbg(std::next(BasicBlock::iterator(VOp))), V, VS, + &Scattered[{V, VS.SplitTy}]); } // In the fallback case, just put the scattered before Point and // keep the result local to Point. 
- return Scatterer(Point->getParent(), Point->getIterator(), V, PtrElemTy); + return Scatterer(Point->getParent(), Point->getIterator(), V, VS); } // Replace Op with the gathered form of the components in CV. Defer the // deletion of Op and creation of the gathered form to the end of the pass, // so that we can avoid creating the gathered form if all uses of Op are // replaced with uses of CV. -void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) { +void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV, + const VectorSplit &VS) { transferMetadataAndIRFlags(Op, CV); // If we already have a scattered form of Op (created from ExtractElements // of Op itself), replace them with the new form. - ValueVector &SV = Scattered[{Op, nullptr}]; + ValueVector &SV = Scattered[{Op, VS.SplitTy}]; if (!SV.empty()) { for (unsigned I = 0, E = SV.size(); I != E; ++I) { Value *V = SV[I]; @@ -483,23 +587,57 @@ void ScalarizerVisitor::transferMetadataAndIRFlags(Instruction *Op, } } +// Determine how Ty is split, if at all. 
+std::optional ScalarizerVisitor::getVectorSplit(Type *Ty) { + VectorSplit Split; + Split.VecTy = dyn_cast(Ty); + if (!Split.VecTy) + return {}; + + unsigned NumElems = Split.VecTy->getNumElements(); + Type *ElemTy = Split.VecTy->getElementType(); + + if (NumElems == 1 || ElemTy->isPointerTy() || + 2 * ElemTy->getScalarSizeInBits() > ScalarizeMinBits) { + Split.NumPacked = 1; + Split.NumFragments = NumElems; + Split.SplitTy = ElemTy; + } else { + Split.NumPacked = ScalarizeMinBits / ElemTy->getScalarSizeInBits(); + if (Split.NumPacked >= NumElems) + return {}; + + Split.NumFragments = divideCeil(NumElems, Split.NumPacked); + Split.SplitTy = FixedVectorType::get(ElemTy, Split.NumPacked); + + unsigned RemainderElems = NumElems % Split.NumPacked; + if (RemainderElems > 1) + Split.RemainderTy = FixedVectorType::get(ElemTy, RemainderElems); + else if (RemainderElems == 1) + Split.RemainderTy = ElemTy; + } + + return Split; +} + // Try to fill in Layout from Ty, returning true on success. Alignment is // the alignment of the vector, or std::nullopt if the ABI default should be // used. std::optional ScalarizerVisitor::getVectorLayout(Type *Ty, Align Alignment, const DataLayout &DL) { + std::optional VS = getVectorSplit(Ty); + if (!VS) + return {}; + VectorLayout Layout; - // Make sure we're dealing with a vector. - Layout.VecTy = dyn_cast(Ty); - if (!Layout.VecTy) - return std::nullopt; - // Check that we're dealing with full-byte elements. - Layout.ElemTy = Layout.VecTy->getElementType(); - if (!DL.typeSizeEqualsStoreSize(Layout.ElemTy)) - return std::nullopt; + Layout.VS = *VS; + // Check that we're dealing with full-byte fragments. 
+ if (!DL.typeSizeEqualsStoreSize(VS->SplitTy) || + (VS->RemainderTy && !DL.typeSizeEqualsStoreSize(VS->RemainderTy))) + return {}; Layout.VecAlign = Alignment; - Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy); + Layout.SplitSize = DL.getTypeStoreSize(VS->SplitTy); return Layout; } @@ -507,19 +645,27 @@ ScalarizerVisitor::getVectorLayout(Type *Ty, Align Alignment, // to create an instruction like I with operand X and name Name. template bool ScalarizerVisitor::splitUnary(Instruction &I, const Splitter &Split) { - auto *VT = dyn_cast(I.getType()); - if (!VT) + std::optional VS = getVectorSplit(I.getType()); + if (!VS) return false; - unsigned NumElems = VT->getNumElements(); + std::optional OpVS; + if (I.getOperand(0)->getType() == I.getType()) { + OpVS = VS; + } else { + OpVS = getVectorSplit(I.getOperand(0)->getType()); + if (!OpVS || VS->NumPacked != OpVS->NumPacked) + return false; + } + IRBuilder<> Builder(&I); - Scatterer Op = scatter(&I, I.getOperand(0)); - assert(Op.size() == NumElems && "Mismatched unary operation"); + Scatterer Op = scatter(&I, I.getOperand(0), *OpVS); + assert(Op.size() == VS->NumFragments && "Mismatched unary operation"); ValueVector Res; - Res.resize(NumElems); - for (unsigned Elem = 0; Elem < NumElems; ++Elem) - Res[Elem] = Split(Builder, Op[Elem], I.getName() + ".i" + Twine(Elem)); - gather(&I, Res); + Res.resize(VS->NumFragments); + for (unsigned Frag = 0; Frag < VS->NumFragments; ++Frag) + Res[Frag] = Split(Builder, Op[Frag], I.getName() + ".i" + Twine(Frag)); + gather(&I, Res, *VS); return true; } @@ -527,24 +673,32 @@ bool ScalarizerVisitor::splitUnary(Instruction &I, const Splitter &Split) { // to create an instruction like I with operands X and Y and name Name. 
template bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) { - auto *VT = dyn_cast(I.getType()); - if (!VT) + std::optional VS = getVectorSplit(I.getType()); + if (!VS) return false; - unsigned NumElems = VT->getNumElements(); + std::optional OpVS; + if (I.getOperand(0)->getType() == I.getType()) { + OpVS = VS; + } else { + OpVS = getVectorSplit(I.getOperand(0)->getType()); + if (!OpVS || VS->NumPacked != OpVS->NumPacked) + return false; + } + IRBuilder<> Builder(&I); - Scatterer VOp0 = scatter(&I, I.getOperand(0)); - Scatterer VOp1 = scatter(&I, I.getOperand(1)); - assert(VOp0.size() == NumElems && "Mismatched binary operation"); - assert(VOp1.size() == NumElems && "Mismatched binary operation"); + Scatterer VOp0 = scatter(&I, I.getOperand(0), *OpVS); + Scatterer VOp1 = scatter(&I, I.getOperand(1), *OpVS); + assert(VOp0.size() == VS->NumFragments && "Mismatched binary operation"); + assert(VOp1.size() == VS->NumFragments && "Mismatched binary operation"); ValueVector Res; - Res.resize(NumElems); - for (unsigned Elem = 0; Elem < NumElems; ++Elem) { - Value *Op0 = VOp0[Elem]; - Value *Op1 = VOp1[Elem]; - Res[Elem] = Split(Builder, Op0, Op1, I.getName() + ".i" + Twine(Elem)); + Res.resize(VS->NumFragments); + for (unsigned Frag = 0; Frag < VS->NumFragments; ++Frag) { + Value *Op0 = VOp0[Frag]; + Value *Op1 = VOp1[Frag]; + Res[Frag] = Split(Builder, Op0, Op1, I.getName() + ".i" + Twine(Frag)); } - gather(&I, Res); + gather(&I, Res, *VS); return true; } @@ -552,18 +706,11 @@ static bool isTriviallyScalariable(Intrinsic::ID ID) { return isTriviallyVectorizable(ID); } -// All of the current scalarizable intrinsics only have one mangled type. -static Function *getScalarIntrinsicDeclaration(Module *M, - Intrinsic::ID ID, - ArrayRef Tys) { - return Intrinsic::getDeclaration(M, ID, Tys); -} - /// If a call to a vector typed intrinsic function, split into a scalar call per /// element if possible for the intrinsic. 
bool ScalarizerVisitor::splitCall(CallInst &CI) { - auto *VT = dyn_cast(CI.getType()); - if (!VT) + std::optional VS = getVectorSplit(CI.getType()); + if (!VS) return false; Function *F = CI.getCalledFunction(); @@ -574,28 +721,41 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { if (ID == Intrinsic::not_intrinsic || !isTriviallyScalariable(ID)) return false; - unsigned NumElems = VT->getNumElements(); + // unsigned NumElems = VT->getNumElements(); unsigned NumArgs = CI.arg_size(); ValueVector ScalarOperands(NumArgs); SmallVector Scattered(NumArgs); - - Scattered.resize(NumArgs); + SmallVector OverloadIdx(NumArgs, -1); SmallVector Tys; // Add return type if intrinsic is overloaded on it. if (isVectorIntrinsicWithOverloadTypeAtArg(ID, -1)) - Tys.push_back(VT->getScalarType()); + Tys.push_back(VS->SplitTy); // Assumes that any vector type has the same number of elements as the return // vector type, which is true for all current intrinsics. for (unsigned I = 0; I != NumArgs; ++I) { Value *OpI = CI.getOperand(I); - if (OpI->getType()->isVectorTy()) { - Scattered[I] = scatter(&CI, OpI); - assert(Scattered[I].size() == NumElems && "mismatched call operands"); - if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) - Tys.push_back(OpI->getType()->getScalarType()); + if (auto *OpVecTy = dyn_cast(OpI->getType())) { + assert(OpVecTy->getNumElements() == VS->VecTy->getNumElements()); + std::optional OpVS = getVectorSplit(OpI->getType()); + if (!OpVS || OpVS->NumPacked != VS->NumPacked) { + // The natural split of the operand doesn't match the result. This can + // happen if the element types differ in size and the ScalarizeMinBits + // option is used. + // + // We could in principle handle this case as well, at the cost of + // complicating the scattering machinery to support multiple scattering + // granularities for a single value. 
+ return false; + } + + Scattered[I] = scatter(&CI, OpI, *OpVS); + if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) { + OverloadIdx[I] = Tys.size(); + Tys.push_back(OpVS->SplitTy); + } } else { ScalarOperands[I] = OpI; if (isVectorIntrinsicWithOverloadTypeAtArg(ID, I)) @@ -603,49 +763,67 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) { } } - ValueVector Res(NumElems); + ValueVector Res(VS->NumFragments); ValueVector ScalarCallOps(NumArgs); - Function *NewIntrin = getScalarIntrinsicDeclaration(F->getParent(), ID, Tys); + Function *NewIntrin = Intrinsic::getDeclaration(F->getParent(), ID, Tys); IRBuilder<> Builder(&CI); // Perform actual scalarization, taking care to preserve any scalar operands. - for (unsigned Elem = 0; Elem < NumElems; ++Elem) { + for (unsigned I = 0; I < VS->NumFragments; ++I) { + bool IsRemainder = I == VS->NumFragments - 1 && VS->RemainderTy; ScalarCallOps.clear(); + if (IsRemainder) + Tys[0] = VS->RemainderTy; + for (unsigned J = 0; J != NumArgs; ++J) { - if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) + if (isVectorIntrinsicWithScalarOpAtArg(ID, J)) { ScalarCallOps.push_back(ScalarOperands[J]); - else - ScalarCallOps.push_back(Scattered[J][Elem]); + } else { + ScalarCallOps.push_back(Scattered[J][I]); + if (IsRemainder && OverloadIdx[J] >= 0) + Tys[OverloadIdx[J]] = Scattered[J][I]->getType(); + } } - Res[Elem] = Builder.CreateCall(NewIntrin, ScalarCallOps, - CI.getName() + ".i" + Twine(Elem)); + if (IsRemainder) + NewIntrin = Intrinsic::getDeclaration(F->getParent(), ID, Tys); + + Res[I] = Builder.CreateCall(NewIntrin, ScalarCallOps, + CI.getName() + ".i" + Twine(I)); } - gather(&CI, Res); + gather(&CI, Res, *VS); return true; } bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) { - auto *VT = dyn_cast(SI.getType()); - if (!VT) + std::optional VS = getVectorSplit(SI.getType()); + if (!VS) return false; - unsigned NumElems = VT->getNumElements(); + std::optional CondVS; + if (isa(SI.getCondition()->getType())) { + CondVS = 
getVectorSplit(SI.getCondition()->getType()); + if (!CondVS || CondVS->NumPacked != VS->NumPacked) { + // This happens when ScalarizeMinBits is used. + return false; + } + } + IRBuilder<> Builder(&SI); - Scatterer VOp1 = scatter(&SI, SI.getOperand(1)); - Scatterer VOp2 = scatter(&SI, SI.getOperand(2)); - assert(VOp1.size() == NumElems && "Mismatched select"); - assert(VOp2.size() == NumElems && "Mismatched select"); + Scatterer VOp1 = scatter(&SI, SI.getOperand(1), *VS); + Scatterer VOp2 = scatter(&SI, SI.getOperand(2), *VS); + assert(VOp1.size() == VS->NumFragments && "Mismatched select"); + assert(VOp2.size() == VS->NumFragments && "Mismatched select"); ValueVector Res; - Res.resize(NumElems); + Res.resize(VS->NumFragments); - if (SI.getOperand(0)->getType()->isVectorTy()) { - Scatterer VOp0 = scatter(&SI, SI.getOperand(0)); - assert(VOp0.size() == NumElems && "Mismatched select"); - for (unsigned I = 0; I < NumElems; ++I) { + if (CondVS) { + Scatterer VOp0 = scatter(&SI, SI.getOperand(0), *CondVS); + assert(VOp0.size() == CondVS->NumFragments && "Mismatched select"); + for (unsigned I = 0; I < VS->NumFragments; ++I) { Value *Op0 = VOp0[I]; Value *Op1 = VOp1[I]; Value *Op2 = VOp2[I]; @@ -654,14 +832,14 @@ bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) { } } else { Value *Op0 = SI.getOperand(0); - for (unsigned I = 0; I < NumElems; ++I) { + for (unsigned I = 0; I < VS->NumFragments; ++I) { Value *Op1 = VOp1[I]; Value *Op2 = VOp2[I]; Res[I] = Builder.CreateSelect(Op0, Op1, Op2, SI.getName() + ".i" + Twine(I)); } } - gather(&SI, Res); + gather(&SI, Res, *VS); return true; } @@ -682,146 +860,194 @@ bool ScalarizerVisitor::visitBinaryOperator(BinaryOperator &BO) { } bool ScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) { - auto *VT = dyn_cast(GEPI.getType()); - if (!VT) + std::optional VS = getVectorSplit(GEPI.getType()); + if (!VS) return false; IRBuilder<> Builder(&GEPI); - unsigned NumElems = VT->getNumElements(); unsigned NumIndices = 
GEPI.getNumIndices(); - // The base pointer might be scalar even if it's a vector GEP. In those cases, - // splat the pointer into a vector value, and scatter that vector. - Value *Op0 = GEPI.getOperand(0); - if (!Op0->getType()->isVectorTy()) - Op0 = Builder.CreateVectorSplat(NumElems, Op0); - Scatterer Base = scatter(&GEPI, Op0); - - SmallVector Ops; - Ops.resize(NumIndices); - for (unsigned I = 0; I < NumIndices; ++I) { - Value *Op = GEPI.getOperand(I + 1); - - // The indices might be scalars even if it's a vector GEP. In those cases, - // splat the scalar into a vector value, and scatter that vector. - if (!Op->getType()->isVectorTy()) - Op = Builder.CreateVectorSplat(NumElems, Op); - - Ops[I] = scatter(&GEPI, Op); + // The base pointer and indices might be scalar even if it's a vector GEP. + SmallVector ScalarOps{1 + NumIndices}; + SmallVector ScatterOps{1 + NumIndices}; + + for (unsigned I = 0; I < 1 + NumIndices; ++I) { + if (auto *VecTy = + dyn_cast(GEPI.getOperand(I)->getType())) { + std::optional OpVS = getVectorSplit(VecTy); + if (!OpVS || OpVS->NumPacked != VS->NumPacked) { + // This can happen when ScalarizeMinBits is used. 
+ return false; + } + ScatterOps[I] = scatter(&GEPI, GEPI.getOperand(I), *OpVS); + } else { + ScalarOps[I] = GEPI.getOperand(I); + } } ValueVector Res; - Res.resize(NumElems); - for (unsigned I = 0; I < NumElems; ++I) { - SmallVector Indices; - Indices.resize(NumIndices); - for (unsigned J = 0; J < NumIndices; ++J) - Indices[J] = Ops[J][I]; - Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), Base[I], Indices, + Res.resize(VS->NumFragments); + for (unsigned I = 0; I < VS->NumFragments; ++I) { + SmallVector SplitOps; + SplitOps.resize(1 + NumIndices); + for (unsigned J = 0; J < 1 + NumIndices; ++J) { + if (ScalarOps[J]) + SplitOps[J] = ScalarOps[J]; + else + SplitOps[J] = ScatterOps[J][I]; + } + Res[I] = Builder.CreateGEP(GEPI.getSourceElementType(), SplitOps[0], + ArrayRef(SplitOps).drop_front(), GEPI.getName() + ".i" + Twine(I)); if (GEPI.isInBounds()) if (GetElementPtrInst *NewGEPI = dyn_cast(Res[I])) NewGEPI->setIsInBounds(); } - gather(&GEPI, Res); + gather(&GEPI, Res, *VS); return true; } bool ScalarizerVisitor::visitCastInst(CastInst &CI) { - auto *VT = dyn_cast(CI.getDestTy()); - if (!VT) + std::optional DestVS = getVectorSplit(CI.getDestTy()); + if (!DestVS) + return false; + + std::optional SrcVS = getVectorSplit(CI.getSrcTy()); + if (!SrcVS || SrcVS->NumPacked != DestVS->NumPacked) return false; - unsigned NumElems = VT->getNumElements(); IRBuilder<> Builder(&CI); - Scatterer Op0 = scatter(&CI, CI.getOperand(0)); - assert(Op0.size() == NumElems && "Mismatched cast"); + Scatterer Op0 = scatter(&CI, CI.getOperand(0), *SrcVS); + assert(Op0.size() == SrcVS->NumFragments && "Mismatched cast"); ValueVector Res; - Res.resize(NumElems); - for (unsigned I = 0; I < NumElems; ++I) - Res[I] = Builder.CreateCast(CI.getOpcode(), Op0[I], VT->getElementType(), - CI.getName() + ".i" + Twine(I)); - gather(&CI, Res); + Res.resize(DestVS->NumFragments); + for (unsigned I = 0; I < DestVS->NumFragments; ++I) + Res[I] = + Builder.CreateCast(CI.getOpcode(), Op0[I], 
DestVS->getFragmentType(I), + CI.getName() + ".i" + Twine(I)); + gather(&CI, Res, *DestVS); return true; } bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) { - auto *DstVT = dyn_cast(BCI.getDestTy()); - auto *SrcVT = dyn_cast(BCI.getSrcTy()); - if (!DstVT || !SrcVT) + std::optional DstVS = getVectorSplit(BCI.getDestTy()); + std::optional SrcVS = getVectorSplit(BCI.getSrcTy()); + if (!DstVS || !SrcVS || DstVS->RemainderTy || SrcVS->RemainderTy) return false; - unsigned DstNumElems = DstVT->getNumElements(); - unsigned SrcNumElems = SrcVT->getNumElements(); + const bool isPointerTy = DstVS->VecTy->getElementType()->isPointerTy(); + + // Vectors of pointers are always fully scalarized. + assert(!isPointerTy || (DstVS->NumPacked == 1 && SrcVS->NumPacked == 1)); + IRBuilder<> Builder(&BCI); - Scatterer Op0 = scatter(&BCI, BCI.getOperand(0)); + Scatterer Op0 = scatter(&BCI, BCI.getOperand(0), *SrcVS); ValueVector Res; - Res.resize(DstNumElems); + Res.resize(DstVS->NumFragments); + + unsigned DstSplitBits = DstVS->SplitTy->getPrimitiveSizeInBits(); + unsigned SrcSplitBits = SrcVS->SplitTy->getPrimitiveSizeInBits(); - if (DstNumElems == SrcNumElems) { - for (unsigned I = 0; I < DstNumElems; ++I) - Res[I] = Builder.CreateBitCast(Op0[I], DstVT->getElementType(), + if (isPointerTy || DstSplitBits == SrcSplitBits) { + assert(DstVS->NumFragments == SrcVS->NumFragments); + for (unsigned I = 0; I < DstVS->NumFragments; ++I) { + Res[I] = Builder.CreateBitCast(Op0[I], DstVS->getFragmentType(I), BCI.getName() + ".i" + Twine(I)); - } else if (DstNumElems > SrcNumElems) { - // <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the - // individual elements to the destination. - unsigned FanOut = DstNumElems / SrcNumElems; - auto *MidTy = FixedVectorType::get(DstVT->getElementType(), FanOut); + } + } else if (SrcSplitBits % DstSplitBits == 0) { + // Convert each source fragment to the same-sized destination vector and + // then scatter the result to the destination. 
+ VectorSplit MidVS; + MidVS.NumPacked = DstVS->NumPacked; + MidVS.NumFragments = SrcSplitBits / DstSplitBits; + MidVS.VecTy = FixedVectorType::get(DstVS->VecTy->getElementType(), + MidVS.NumPacked * MidVS.NumFragments); + MidVS.SplitTy = DstVS->SplitTy; + unsigned ResI = 0; - for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) { - Value *V = Op0[Op0I]; - Instruction *VI; + for (unsigned I = 0; I < SrcVS->NumFragments; ++I) { + Value *V = Op0[I]; + // Look through any existing bitcasts before converting to <N x t2>. // In the best case, the resulting conversion might be a no-op. + Instruction *VI; while ((VI = dyn_cast(V)) && VI->getOpcode() == Instruction::BitCast) V = VI->getOperand(0); - V = Builder.CreateBitCast(V, MidTy, V->getName() + ".cast"); - Scatterer Mid = scatter(&BCI, V); - for (unsigned MidI = 0; MidI < FanOut; ++MidI) - Res[ResI++] = Mid[MidI]; + + V = Builder.CreateBitCast(V, MidVS.VecTy, V->getName() + ".cast"); + + Scatterer Mid = scatter(&BCI, V, MidVS); + for (unsigned J = 0; J < MidVS.NumFragments; ++J) + Res[ResI++] = Mid[J]; } - } else { - // <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2. - unsigned FanIn = SrcNumElems / DstNumElems; - auto *MidTy = FixedVectorType::get(SrcVT->getElementType(), FanIn); - unsigned Op0I = 0; - for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) { - Value *V = PoisonValue::get(MidTy); - for (unsigned MidI = 0; MidI < FanIn; ++MidI) - V = Builder.CreateInsertElement(V, Op0[Op0I++], MidI, - BCI.getName() + ".i" + Twine(ResI) + - ".upto" + Twine(MidI)); - Res[ResI] = Builder.CreateBitCast(V, DstVT->getElementType(), - BCI.getName() + ".i" + Twine(ResI)); + } else if (DstSplitBits % SrcSplitBits == 0) { + // Gather enough source fragments to make up a destination fragment and + // then convert to the destination type. 
+ VectorSplit MidVS; + MidVS.NumFragments = DstSplitBits / SrcSplitBits; + MidVS.NumPacked = SrcVS->NumPacked; + MidVS.VecTy = FixedVectorType::get(SrcVS->VecTy->getElementType(), + MidVS.NumPacked * MidVS.NumFragments); + MidVS.SplitTy = SrcVS->SplitTy; + + unsigned SrcI = 0; + SmallVector ConcatOps; + ConcatOps.resize(MidVS.NumFragments); + for (unsigned I = 0; I < DstVS->NumFragments; ++I) { + for (unsigned J = 0; J < MidVS.NumFragments; ++J) + ConcatOps[J] = Op0[SrcI++]; + Value *V = concatenate(Builder, ConcatOps, MidVS, + BCI.getName() + ".i" + Twine(I)); + Res[I] = Builder.CreateBitCast(V, DstVS->getFragmentType(I), + BCI.getName() + ".i" + Twine(I)); } + } else { + return false; } - gather(&BCI, Res); + + gather(&BCI, Res, *DstVS); return true; } bool ScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) { - auto *VT = dyn_cast(IEI.getType()); - if (!VT) + std::optional VS = getVectorSplit(IEI.getType()); + if (!VS) return false; - unsigned NumElems = VT->getNumElements(); IRBuilder<> Builder(&IEI); - Scatterer Op0 = scatter(&IEI, IEI.getOperand(0)); + Scatterer Op0 = scatter(&IEI, IEI.getOperand(0), *VS); Value *NewElt = IEI.getOperand(1); Value *InsIdx = IEI.getOperand(2); ValueVector Res; - Res.resize(NumElems); + Res.resize(VS->NumFragments); if (auto *CI = dyn_cast(InsIdx)) { - for (unsigned I = 0; I < NumElems; ++I) - Res[I] = CI->getValue().getZExtValue() == I ? 
NewElt : Op0[I]; + unsigned Idx = CI->getZExtValue(); + unsigned Fragment = Idx / VS->NumPacked; + for (unsigned I = 0; I < VS->NumFragments; ++I) { + if (I == Fragment) { + bool IsPacked = VS->NumPacked > 1; + if (Fragment == VS->NumFragments - 1 && VS->RemainderTy && + !VS->RemainderTy->isVectorTy()) + IsPacked = false; + if (IsPacked) { + Res[I] = + Builder.CreateInsertElement(Op0[I], NewElt, Idx % VS->NumPacked); + } else { + Res[I] = NewElt; + } + } else { + Res[I] = Op0[I]; + } + } } else { - if (!ScalarizeVariableInsertExtract) + // Never split a variable insertelement that isn't fully scalarized. + if (!ScalarizeVariableInsertExtract || VS->NumPacked > 1) return false; - for (unsigned I = 0; I < NumElems; ++I) { + for (unsigned I = 0; I < VS->NumFragments; ++I) { Value *ShouldReplace = Builder.CreateICmpEQ(InsIdx, ConstantInt::get(InsIdx->getType(), I), InsIdx->getName() + ".is." + Twine(I)); @@ -831,31 +1057,39 @@ bool ScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) { } } - gather(&IEI, Res); + gather(&IEI, Res, *VS); return true; } bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { - auto *VT = dyn_cast(EEI.getOperand(0)->getType()); - if (!VT) + std::optional VS = getVectorSplit(EEI.getOperand(0)->getType()); + if (!VS) return false; - unsigned NumSrcElems = VT->getNumElements(); IRBuilder<> Builder(&EEI); - Scatterer Op0 = scatter(&EEI, EEI.getOperand(0)); + Scatterer Op0 = scatter(&EEI, EEI.getOperand(0), *VS); Value *ExtIdx = EEI.getOperand(1); if (auto *CI = dyn_cast(ExtIdx)) { - Value *Res = Op0[CI->getValue().getZExtValue()]; + unsigned Idx = CI->getZExtValue(); + unsigned Fragment = Idx / VS->NumPacked; + Value *Res = Op0[Fragment]; + bool IsPacked = VS->NumPacked > 1; + if (Fragment == VS->NumFragments - 1 && VS->RemainderTy && + !VS->RemainderTy->isVectorTy()) + IsPacked = false; + if (IsPacked) + Res = Builder.CreateExtractElement(Res, Idx % VS->NumPacked); replaceUses(&EEI, Res); return true; } - if 
(!ScalarizeVariableInsertExtract) + // Never split a variable extractelement that isn't fully scalarized. + if (!ScalarizeVariableInsertExtract || VS->NumPacked > 1) return false; - Value *Res = PoisonValue::get(VT->getElementType()); - for (unsigned I = 0; I < NumSrcElems; ++I) { + Value *Res = PoisonValue::get(VS->VecTy->getElementType()); + for (unsigned I = 0; I < VS->NumFragments; ++I) { Value *ShouldExtract = Builder.CreateICmpEQ(ExtIdx, ConstantInt::get(ExtIdx->getType(), I), ExtIdx->getName() + ".is." + Twine(I)); @@ -868,51 +1102,52 @@ bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) { } bool ScalarizerVisitor::visitShuffleVectorInst(ShuffleVectorInst &SVI) { - auto *VT = dyn_cast(SVI.getType()); - if (!VT) + std::optional VS = getVectorSplit(SVI.getType()); + std::optional VSOp = + getVectorSplit(SVI.getOperand(0)->getType()); + if (!VS || !VSOp || VS->NumPacked > 1 || VSOp->NumPacked > 1) return false; - unsigned NumElems = VT->getNumElements(); - Scatterer Op0 = scatter(&SVI, SVI.getOperand(0)); - Scatterer Op1 = scatter(&SVI, SVI.getOperand(1)); + Scatterer Op0 = scatter(&SVI, SVI.getOperand(0), *VSOp); + Scatterer Op1 = scatter(&SVI, SVI.getOperand(1), *VSOp); ValueVector Res; - Res.resize(NumElems); + Res.resize(VS->NumFragments); - for (unsigned I = 0; I < NumElems; ++I) { + for (unsigned I = 0; I < VS->NumFragments; ++I) { int Selector = SVI.getMaskValue(I); if (Selector < 0) - Res[I] = UndefValue::get(VT->getElementType()); + Res[I] = UndefValue::get(VS->VecTy->getElementType()); else if (unsigned(Selector) < Op0.size()) Res[I] = Op0[Selector]; else Res[I] = Op1[Selector - Op0.size()]; } - gather(&SVI, Res); + gather(&SVI, Res, *VS); return true; } bool ScalarizerVisitor::visitPHINode(PHINode &PHI) { - auto *VT = dyn_cast(PHI.getType()); - if (!VT) + std::optional VS = getVectorSplit(PHI.getType()); + if (!VS) return false; - unsigned NumElems = cast(VT)->getNumElements(); IRBuilder<> Builder(&PHI); ValueVector Res; - 
Res.resize(NumElems); + Res.resize(VS->NumFragments); unsigned NumOps = PHI.getNumOperands(); - for (unsigned I = 0; I < NumElems; ++I) - Res[I] = Builder.CreatePHI(VT->getElementType(), NumOps, + for (unsigned I = 0; I < VS->NumFragments; ++I) { + Res[I] = Builder.CreatePHI(VS->getFragmentType(I), NumOps, PHI.getName() + ".i" + Twine(I)); + } for (unsigned I = 0; I < NumOps; ++I) { - Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I)); + Scatterer Op = scatter(&PHI, PHI.getIncomingValue(I), *VS); BasicBlock *IncomingBlock = PHI.getIncomingBlock(I); - for (unsigned J = 0; J < NumElems; ++J) + for (unsigned J = 0; J < VS->NumFragments; ++J) cast(Res[J])->addIncoming(Op[J], IncomingBlock); } - gather(&PHI, Res); + gather(&PHI, Res, *VS); return true; } @@ -927,17 +1162,17 @@ bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) { if (!Layout) return false; - unsigned NumElems = cast(Layout->VecTy)->getNumElements(); IRBuilder<> Builder(&LI); - Scatterer Ptr = scatter(&LI, LI.getPointerOperand(), LI.getType()); + Scatterer Ptr = scatter(&LI, LI.getPointerOperand(), Layout->VS); ValueVector Res; - Res.resize(NumElems); + Res.resize(Layout->VS.NumFragments); - for (unsigned I = 0; I < NumElems; ++I) - Res[I] = Builder.CreateAlignedLoad(Layout->VecTy->getElementType(), Ptr[I], - Align(Layout->getElemAlign(I)), + for (unsigned I = 0; I < Layout->VS.NumFragments; ++I) { + Res[I] = Builder.CreateAlignedLoad(Layout->VS.getFragmentType(I), Ptr[I], + Align(Layout->getFragmentAlign(I)), LI.getName() + ".i" + Twine(I)); - gather(&LI, Res); + } + gather(&LI, Res, Layout->VS); return true; } @@ -953,17 +1188,17 @@ bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) { if (!Layout) return false; - unsigned NumElems = cast(Layout->VecTy)->getNumElements(); IRBuilder<> Builder(&SI); - Scatterer VPtr = scatter(&SI, SI.getPointerOperand(), FullValue->getType()); - Scatterer VVal = scatter(&SI, FullValue); + Scatterer VPtr = scatter(&SI, SI.getPointerOperand(), Layout->VS); + Scatterer 
VVal = scatter(&SI, FullValue, Layout->VS); ValueVector Stores; - Stores.resize(NumElems); - for (unsigned I = 0; I < NumElems; ++I) { + Stores.resize(Layout->VS.NumFragments); + for (unsigned I = 0; I < Layout->VS.NumFragments; ++I) { Value *Val = VVal[I]; Value *Ptr = VPtr[I]; - Stores[I] = Builder.CreateAlignedStore(Val, Ptr, Layout->getElemAlign(I)); + Stores[I] = + Builder.CreateAlignedStore(Val, Ptr, Layout->getFragmentAlign(I)); } transferMetadataAndIRFlags(&SI, Stores); return true; @@ -991,17 +1226,19 @@ bool ScalarizerVisitor::finish() { ValueVector &CV = *GMI.second; if (!Op->use_empty()) { // The value is still needed, so recreate it using a series of - // InsertElements. - Value *Res = PoisonValue::get(Op->getType()); + // insertelements and/or shufflevectors. + Value *Res; if (auto *Ty = dyn_cast(Op->getType())) { BasicBlock *BB = Op->getParent(); - unsigned Count = Ty->getNumElements(); IRBuilder<> Builder(Op); if (isa(Op)) Builder.SetInsertPoint(BB, BB->getFirstInsertionPt()); - for (unsigned I = 0; I < Count; ++I) - Res = Builder.CreateInsertElement(Res, CV[I], I, - Op->getName() + ".upto" + Twine(I)); + + VectorSplit VS = *getVectorSplit(Ty); + assert(VS.NumFragments == CV.size()); + + Res = concatenate(Builder, CV, VS, Op->getName()); + Res->takeName(Op); } else { assert(CV.size() == 1 && Op->getType() == CV[0]->getType()); diff --git a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll index 7ea0241b167e2..bbcdcb6f58674 100644 --- a/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll +++ b/llvm/test/Transforms/Scalarizer/basic-inseltpoison.ll @@ -559,17 +559,17 @@ define void @f13(ptr %dest, <4 x ptr> %ptr, <4 x i32> %i, ; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr ptr, ptr [[DEST:%.*]], i32 1 ; CHECK-NEXT: [[DEST_I2:%.*]] = getelementptr ptr, ptr [[DEST]], i32 2 ; CHECK-NEXT: [[DEST_I3:%.*]] = getelementptr ptr, ptr [[DEST]], i32 3 -; CHECK-NEXT: [[I_I0:%.*]] = extractelement <4 x 
i32> [[I:%.*]], i64 0 ; CHECK-NEXT: [[PTR_I0:%.*]] = extractelement <4 x ptr> [[PTR:%.*]], i64 0 +; CHECK-NEXT: [[I_I0:%.*]] = extractelement <4 x i32> [[I:%.*]], i64 0 ; CHECK-NEXT: [[VAL_I0:%.*]] = getelementptr inbounds [4 x float], ptr [[PTR_I0]], i32 0, i32 [[I_I0]] -; CHECK-NEXT: [[I_I1:%.*]] = extractelement <4 x i32> [[I]], i64 1 ; CHECK-NEXT: [[PTR_I1:%.*]] = extractelement <4 x ptr> [[PTR]], i64 1 +; CHECK-NEXT: [[I_I1:%.*]] = extractelement <4 x i32> [[I]], i64 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = getelementptr inbounds [4 x float], ptr [[PTR_I1]], i32 1, i32 [[I_I1]] -; CHECK-NEXT: [[I_I2:%.*]] = extractelement <4 x i32> [[I]], i64 2 ; CHECK-NEXT: [[PTR_I2:%.*]] = extractelement <4 x ptr> [[PTR]], i64 2 +; CHECK-NEXT: [[I_I2:%.*]] = extractelement <4 x i32> [[I]], i64 2 ; CHECK-NEXT: [[VAL_I2:%.*]] = getelementptr inbounds [4 x float], ptr [[PTR_I2]], i32 2, i32 [[I_I2]] -; CHECK-NEXT: [[I_I3:%.*]] = extractelement <4 x i32> [[I]], i64 3 ; CHECK-NEXT: [[PTR_I3:%.*]] = extractelement <4 x ptr> [[PTR]], i64 3 +; CHECK-NEXT: [[I_I3:%.*]] = extractelement <4 x i32> [[I]], i64 3 ; CHECK-NEXT: [[VAL_I3:%.*]] = getelementptr inbounds [4 x float], ptr [[PTR_I3]], i32 3, i32 [[I_I3]] ; CHECK-NEXT: store ptr [[VAL_I0]], ptr [[DEST]], align 32 ; CHECK-NEXT: store ptr [[VAL_I1]], ptr [[DEST_I1]], align 8 diff --git a/llvm/test/Transforms/Scalarizer/basic.ll b/llvm/test/Transforms/Scalarizer/basic.ll index ddc942f6488d7..db7c5f535f7e9 100644 --- a/llvm/test/Transforms/Scalarizer/basic.ll +++ b/llvm/test/Transforms/Scalarizer/basic.ll @@ -559,17 +559,17 @@ define void @f13(ptr %dest, <4 x ptr> %ptr, <4 x i32> %i, ; CHECK-NEXT: [[DEST_I1:%.*]] = getelementptr ptr, ptr [[DEST:%.*]], i32 1 ; CHECK-NEXT: [[DEST_I2:%.*]] = getelementptr ptr, ptr [[DEST]], i32 2 ; CHECK-NEXT: [[DEST_I3:%.*]] = getelementptr ptr, ptr [[DEST]], i32 3 -; CHECK-NEXT: [[I_I0:%.*]] = extractelement <4 x i32> [[I:%.*]], i64 0 ; CHECK-NEXT: [[PTR_I0:%.*]] = extractelement <4 x ptr> [[PTR:%.*]], i64 0 
+; CHECK-NEXT: [[I_I0:%.*]] = extractelement <4 x i32> [[I:%.*]], i64 0 ; CHECK-NEXT: [[VAL_I0:%.*]] = getelementptr inbounds [4 x float], ptr [[PTR_I0]], i32 0, i32 [[I_I0]] -; CHECK-NEXT: [[I_I1:%.*]] = extractelement <4 x i32> [[I]], i64 1 ; CHECK-NEXT: [[PTR_I1:%.*]] = extractelement <4 x ptr> [[PTR]], i64 1 +; CHECK-NEXT: [[I_I1:%.*]] = extractelement <4 x i32> [[I]], i64 1 ; CHECK-NEXT: [[VAL_I1:%.*]] = getelementptr inbounds [4 x float], ptr [[PTR_I1]], i32 1, i32 [[I_I1]] -; CHECK-NEXT: [[I_I2:%.*]] = extractelement <4 x i32> [[I]], i64 2 ; CHECK-NEXT: [[PTR_I2:%.*]] = extractelement <4 x ptr> [[PTR]], i64 2 +; CHECK-NEXT: [[I_I2:%.*]] = extractelement <4 x i32> [[I]], i64 2 ; CHECK-NEXT: [[VAL_I2:%.*]] = getelementptr inbounds [4 x float], ptr [[PTR_I2]], i32 2, i32 [[I_I2]] -; CHECK-NEXT: [[I_I3:%.*]] = extractelement <4 x i32> [[I]], i64 3 ; CHECK-NEXT: [[PTR_I3:%.*]] = extractelement <4 x ptr> [[PTR]], i64 3 +; CHECK-NEXT: [[I_I3:%.*]] = extractelement <4 x i32> [[I]], i64 3 ; CHECK-NEXT: [[VAL_I3:%.*]] = getelementptr inbounds [4 x float], ptr [[PTR_I3]], i32 3, i32 [[I_I3]] ; CHECK-NEXT: store ptr [[VAL_I0]], ptr [[DEST]], align 32 ; CHECK-NEXT: store ptr [[VAL_I1]], ptr [[DEST_I1]], align 8 diff --git a/llvm/test/Transforms/Scalarizer/min-bits.ll b/llvm/test/Transforms/Scalarizer/min-bits.ll index 1ad5fe37e8cad..be901d4990592 100644 --- a/llvm/test/Transforms/Scalarizer/min-bits.ll +++ b/llvm/test/Transforms/Scalarizer/min-bits.ll @@ -1,20 +1,28 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt %s -passes='function(scalarizer,dce)' -scalarize-load-store -S | FileCheck %s --check-prefixes=CHECK +; RUN: opt %s -passes='function(scalarizer,dce)' -scalarize-load-store -scalarize-min-bits=16 -S | FileCheck %s --check-prefixes=CHECK,MIN16 +; RUN: opt %s -passes='function(scalarizer,dce)' -scalarize-load-store -scalarize-min-bits=32 -S | FileCheck %s --check-prefixes=CHECK,MIN32 target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" define void @load_add_store_v2i16(ptr %pa, ptr %pb) { -; CHECK-LABEL: @load_add_store_v2i16( -; CHECK-NEXT: [[PB_I1:%.*]] = getelementptr i16, ptr [[PB:%.*]], i32 1 -; CHECK-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 8 -; CHECK-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 -; CHECK-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 -; CHECK-NEXT: [[B_I0:%.*]] = load i16, ptr [[PB]], align 8 -; CHECK-NEXT: [[B_I1:%.*]] = load i16, ptr [[PB_I1]], align 2 -; CHECK-NEXT: [[C_I0:%.*]] = add i16 [[A_I0]], [[B_I0]] -; CHECK-NEXT: [[C_I1:%.*]] = add i16 [[A_I1]], [[B_I1]] -; CHECK-NEXT: store i16 [[C_I0]], ptr [[PA]], align 8 -; CHECK-NEXT: store i16 [[C_I1]], ptr [[PA_I1]], align 2 -; CHECK-NEXT: ret void +; MIN16-LABEL: @load_add_store_v2i16( +; MIN16-NEXT: [[PB_I1:%.*]] = getelementptr i16, ptr [[PB:%.*]], i32 1 +; MIN16-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 8 +; MIN16-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 +; MIN16-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 +; MIN16-NEXT: [[B_I0:%.*]] = load i16, ptr [[PB]], align 8 +; MIN16-NEXT: [[B_I1:%.*]] = load i16, ptr [[PB_I1]], align 2 +; MIN16-NEXT: [[C_I0:%.*]] = add i16 [[A_I0]], [[B_I0]] +; MIN16-NEXT: [[C_I1:%.*]] = add i16 [[A_I1]], [[B_I1]] +; MIN16-NEXT: store i16 [[C_I0]], ptr [[PA]], align 8 +; MIN16-NEXT: store i16 [[C_I1]], ptr [[PA_I1]], align 2 +; MIN16-NEXT: ret void +; +; MIN32-LABEL: @load_add_store_v2i16( +; MIN32-NEXT: [[A:%.*]] = load <2 x i16>, ptr [[PA:%.*]], align 8 +; MIN32-NEXT: [[B:%.*]] = load <2 x i16>, ptr [[PB:%.*]], align 8 +; MIN32-NEXT: [[C:%.*]] = add <2 x i16> [[A]], [[B]] +; MIN32-NEXT: store <2 x i16> [[C]], ptr [[PA]], align 8 +; MIN32-NEXT: ret void ; %a = load <2 x i16>, ptr %pa, align 8 %b = load <2 x i16>, ptr %pb, align 8 @@ -24,24 +32,37 @@ define void 
@load_add_store_v2i16(ptr %pa, ptr %pb) { } define void @load_add_store_v3i16(ptr %pa, ptr %pb) { -; CHECK-LABEL: @load_add_store_v3i16( -; CHECK-NEXT: [[PB_I1:%.*]] = getelementptr i16, ptr [[PB:%.*]], i32 1 -; CHECK-NEXT: [[PB_I2:%.*]] = getelementptr i16, ptr [[PB]], i32 2 -; CHECK-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 8 -; CHECK-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 -; CHECK-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 -; CHECK-NEXT: [[PA_I2:%.*]] = getelementptr i16, ptr [[PA]], i32 2 -; CHECK-NEXT: [[A_I2:%.*]] = load i16, ptr [[PA_I2]], align 4 -; CHECK-NEXT: [[B_I0:%.*]] = load i16, ptr [[PB]], align 8 -; CHECK-NEXT: [[B_I1:%.*]] = load i16, ptr [[PB_I1]], align 2 -; CHECK-NEXT: [[B_I2:%.*]] = load i16, ptr [[PB_I2]], align 4 -; CHECK-NEXT: [[C_I0:%.*]] = add i16 [[A_I0]], [[B_I0]] -; CHECK-NEXT: [[C_I1:%.*]] = add i16 [[A_I1]], [[B_I1]] -; CHECK-NEXT: [[C_I2:%.*]] = add i16 [[A_I2]], [[B_I2]] -; CHECK-NEXT: store i16 [[C_I0]], ptr [[PA]], align 8 -; CHECK-NEXT: store i16 [[C_I1]], ptr [[PA_I1]], align 2 -; CHECK-NEXT: store i16 [[C_I2]], ptr [[PA_I2]], align 4 -; CHECK-NEXT: ret void +; MIN16-LABEL: @load_add_store_v3i16( +; MIN16-NEXT: [[PB_I1:%.*]] = getelementptr i16, ptr [[PB:%.*]], i32 1 +; MIN16-NEXT: [[PB_I2:%.*]] = getelementptr i16, ptr [[PB]], i32 2 +; MIN16-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 8 +; MIN16-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 +; MIN16-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 +; MIN16-NEXT: [[PA_I2:%.*]] = getelementptr i16, ptr [[PA]], i32 2 +; MIN16-NEXT: [[A_I2:%.*]] = load i16, ptr [[PA_I2]], align 4 +; MIN16-NEXT: [[B_I0:%.*]] = load i16, ptr [[PB]], align 8 +; MIN16-NEXT: [[B_I1:%.*]] = load i16, ptr [[PB_I1]], align 2 +; MIN16-NEXT: [[B_I2:%.*]] = load i16, ptr [[PB_I2]], align 4 +; MIN16-NEXT: [[C_I0:%.*]] = add i16 [[A_I0]], [[B_I0]] +; MIN16-NEXT: [[C_I1:%.*]] = add i16 [[A_I1]], [[B_I1]] +; MIN16-NEXT: [[C_I2:%.*]] = add i16 
[[A_I2]], [[B_I2]] +; MIN16-NEXT: store i16 [[C_I0]], ptr [[PA]], align 8 +; MIN16-NEXT: store i16 [[C_I1]], ptr [[PA_I1]], align 2 +; MIN16-NEXT: store i16 [[C_I2]], ptr [[PA_I2]], align 4 +; MIN16-NEXT: ret void +; +; MIN32-LABEL: @load_add_store_v3i16( +; MIN32-NEXT: [[PB_I1:%.*]] = getelementptr <2 x i16>, ptr [[PB:%.*]], i32 1 +; MIN32-NEXT: [[A_I0:%.*]] = load <2 x i16>, ptr [[PA:%.*]], align 8 +; MIN32-NEXT: [[PA_I1:%.*]] = getelementptr <2 x i16>, ptr [[PA]], i32 1 +; MIN32-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 4 +; MIN32-NEXT: [[B_I0:%.*]] = load <2 x i16>, ptr [[PB]], align 8 +; MIN32-NEXT: [[B_I1:%.*]] = load i16, ptr [[PB_I1]], align 4 +; MIN32-NEXT: [[C_I0:%.*]] = add <2 x i16> [[A_I0]], [[B_I0]] +; MIN32-NEXT: [[C_I1:%.*]] = add i16 [[A_I1]], [[B_I1]] +; MIN32-NEXT: store <2 x i16> [[C_I0]], ptr [[PA]], align 8 +; MIN32-NEXT: store i16 [[C_I1]], ptr [[PA_I1]], align 4 +; MIN32-NEXT: ret void ; %a = load <3 x i16>, ptr %pa, align 8 %b = load <3 x i16>, ptr %pb, align 8 @@ -51,30 +72,43 @@ define void @load_add_store_v3i16(ptr %pa, ptr %pb) { } define void @load_add_store_v4i16(ptr %pa, ptr %pb) { -; CHECK-LABEL: @load_add_store_v4i16( -; CHECK-NEXT: [[PB_I1:%.*]] = getelementptr i16, ptr [[PB:%.*]], i32 1 -; CHECK-NEXT: [[PB_I2:%.*]] = getelementptr i16, ptr [[PB]], i32 2 -; CHECK-NEXT: [[PB_I3:%.*]] = getelementptr i16, ptr [[PB]], i32 3 -; CHECK-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 8 -; CHECK-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 -; CHECK-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 -; CHECK-NEXT: [[PA_I2:%.*]] = getelementptr i16, ptr [[PA]], i32 2 -; CHECK-NEXT: [[A_I2:%.*]] = load i16, ptr [[PA_I2]], align 4 -; CHECK-NEXT: [[PA_I3:%.*]] = getelementptr i16, ptr [[PA]], i32 3 -; CHECK-NEXT: [[A_I3:%.*]] = load i16, ptr [[PA_I3]], align 2 -; CHECK-NEXT: [[B_I0:%.*]] = load i16, ptr [[PB]], align 8 -; CHECK-NEXT: [[B_I1:%.*]] = load i16, ptr [[PB_I1]], align 2 -; CHECK-NEXT: [[B_I2:%.*]] = 
load i16, ptr [[PB_I2]], align 4 -; CHECK-NEXT: [[B_I3:%.*]] = load i16, ptr [[PB_I3]], align 2 -; CHECK-NEXT: [[C_I0:%.*]] = add i16 [[A_I0]], [[B_I0]] -; CHECK-NEXT: [[C_I1:%.*]] = add i16 [[A_I1]], [[B_I1]] -; CHECK-NEXT: [[C_I2:%.*]] = add i16 [[A_I2]], [[B_I2]] -; CHECK-NEXT: [[C_I3:%.*]] = add i16 [[A_I3]], [[B_I3]] -; CHECK-NEXT: store i16 [[C_I0]], ptr [[PA]], align 8 -; CHECK-NEXT: store i16 [[C_I1]], ptr [[PA_I1]], align 2 -; CHECK-NEXT: store i16 [[C_I2]], ptr [[PA_I2]], align 4 -; CHECK-NEXT: store i16 [[C_I3]], ptr [[PA_I3]], align 2 -; CHECK-NEXT: ret void +; MIN16-LABEL: @load_add_store_v4i16( +; MIN16-NEXT: [[PB_I1:%.*]] = getelementptr i16, ptr [[PB:%.*]], i32 1 +; MIN16-NEXT: [[PB_I2:%.*]] = getelementptr i16, ptr [[PB]], i32 2 +; MIN16-NEXT: [[PB_I3:%.*]] = getelementptr i16, ptr [[PB]], i32 3 +; MIN16-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 8 +; MIN16-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 +; MIN16-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 +; MIN16-NEXT: [[PA_I2:%.*]] = getelementptr i16, ptr [[PA]], i32 2 +; MIN16-NEXT: [[A_I2:%.*]] = load i16, ptr [[PA_I2]], align 4 +; MIN16-NEXT: [[PA_I3:%.*]] = getelementptr i16, ptr [[PA]], i32 3 +; MIN16-NEXT: [[A_I3:%.*]] = load i16, ptr [[PA_I3]], align 2 +; MIN16-NEXT: [[B_I0:%.*]] = load i16, ptr [[PB]], align 8 +; MIN16-NEXT: [[B_I1:%.*]] = load i16, ptr [[PB_I1]], align 2 +; MIN16-NEXT: [[B_I2:%.*]] = load i16, ptr [[PB_I2]], align 4 +; MIN16-NEXT: [[B_I3:%.*]] = load i16, ptr [[PB_I3]], align 2 +; MIN16-NEXT: [[C_I0:%.*]] = add i16 [[A_I0]], [[B_I0]] +; MIN16-NEXT: [[C_I1:%.*]] = add i16 [[A_I1]], [[B_I1]] +; MIN16-NEXT: [[C_I2:%.*]] = add i16 [[A_I2]], [[B_I2]] +; MIN16-NEXT: [[C_I3:%.*]] = add i16 [[A_I3]], [[B_I3]] +; MIN16-NEXT: store i16 [[C_I0]], ptr [[PA]], align 8 +; MIN16-NEXT: store i16 [[C_I1]], ptr [[PA_I1]], align 2 +; MIN16-NEXT: store i16 [[C_I2]], ptr [[PA_I2]], align 4 +; MIN16-NEXT: store i16 [[C_I3]], ptr [[PA_I3]], align 2 +; 
MIN16-NEXT: ret void +; +; MIN32-LABEL: @load_add_store_v4i16( +; MIN32-NEXT: [[PB_I1:%.*]] = getelementptr <2 x i16>, ptr [[PB:%.*]], i32 1 +; MIN32-NEXT: [[A_I0:%.*]] = load <2 x i16>, ptr [[PA:%.*]], align 8 +; MIN32-NEXT: [[PA_I1:%.*]] = getelementptr <2 x i16>, ptr [[PA]], i32 1 +; MIN32-NEXT: [[A_I1:%.*]] = load <2 x i16>, ptr [[PA_I1]], align 4 +; MIN32-NEXT: [[B_I0:%.*]] = load <2 x i16>, ptr [[PB]], align 8 +; MIN32-NEXT: [[B_I1:%.*]] = load <2 x i16>, ptr [[PB_I1]], align 4 +; MIN32-NEXT: [[C_I0:%.*]] = add <2 x i16> [[A_I0]], [[B_I0]] +; MIN32-NEXT: [[C_I1:%.*]] = add <2 x i16> [[A_I1]], [[B_I1]] +; MIN32-NEXT: store <2 x i16> [[C_I0]], ptr [[PA]], align 8 +; MIN32-NEXT: store <2 x i16> [[C_I1]], ptr [[PA_I1]], align 4 +; MIN32-NEXT: ret void ; %a = load <4 x i16>, ptr %pa, align 8 %b = load <4 x i16>, ptr %pb, align 8 @@ -83,61 +117,132 @@ define void @load_add_store_v4i16(ptr %pa, ptr %pb) { ret void } +define void @load_add_store_v4i10(ptr %pa, ptr %pb) { +; MIN16-LABEL: @load_add_store_v4i10( +; MIN16-NEXT: [[A:%.*]] = load <4 x i10>, ptr [[PA:%.*]], align 8 +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <4 x i10> [[A]], i64 0 +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <4 x i10> [[A]], i64 1 +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <4 x i10> [[A]], i64 2 +; MIN16-NEXT: [[A_I3:%.*]] = extractelement <4 x i10> [[A]], i64 3 +; MIN16-NEXT: [[B:%.*]] = load <4 x i10>, ptr [[PB:%.*]], align 8 +; MIN16-NEXT: [[B_I0:%.*]] = extractelement <4 x i10> [[B]], i64 0 +; MIN16-NEXT: [[C_I0:%.*]] = add i10 [[A_I0]], [[B_I0]] +; MIN16-NEXT: [[B_I1:%.*]] = extractelement <4 x i10> [[B]], i64 1 +; MIN16-NEXT: [[C_I1:%.*]] = add i10 [[A_I1]], [[B_I1]] +; MIN16-NEXT: [[B_I2:%.*]] = extractelement <4 x i10> [[B]], i64 2 +; MIN16-NEXT: [[C_I2:%.*]] = add i10 [[A_I2]], [[B_I2]] +; MIN16-NEXT: [[B_I3:%.*]] = extractelement <4 x i10> [[B]], i64 3 +; MIN16-NEXT: [[C_I3:%.*]] = add i10 [[A_I3]], [[B_I3]] +; MIN16-NEXT: [[C_UPTO0:%.*]] = insertelement <4 x i10> poison, i10 
[[C_I0]], i64 0 +; MIN16-NEXT: [[C_UPTO1:%.*]] = insertelement <4 x i10> [[C_UPTO0]], i10 [[C_I1]], i64 1 +; MIN16-NEXT: [[C_UPTO2:%.*]] = insertelement <4 x i10> [[C_UPTO1]], i10 [[C_I2]], i64 2 +; MIN16-NEXT: [[C:%.*]] = insertelement <4 x i10> [[C_UPTO2]], i10 [[C_I3]], i64 3 +; MIN16-NEXT: store <4 x i10> [[C]], ptr [[PA]], align 8 +; MIN16-NEXT: ret void +; +; MIN32-LABEL: @load_add_store_v4i10( +; MIN32-NEXT: [[A:%.*]] = load <4 x i10>, ptr [[PA:%.*]], align 8 +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <4 x i10> [[A]], <4 x i10> poison, <3 x i32> +; MIN32-NEXT: [[A_I1:%.*]] = extractelement <4 x i10> [[A]], i64 3 +; MIN32-NEXT: [[B:%.*]] = load <4 x i10>, ptr [[PB:%.*]], align 8 +; MIN32-NEXT: [[B_I0:%.*]] = shufflevector <4 x i10> [[B]], <4 x i10> poison, <3 x i32> +; MIN32-NEXT: [[C_I0:%.*]] = add <3 x i10> [[A_I0]], [[B_I0]] +; MIN32-NEXT: [[B_I1:%.*]] = extractelement <4 x i10> [[B]], i64 3 +; MIN32-NEXT: [[C_I1:%.*]] = add i10 [[A_I1]], [[B_I1]] +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <3 x i10> [[C_I0]], <3 x i10> [[C_I0]], <4 x i32> +; MIN32-NEXT: [[C:%.*]] = insertelement <4 x i10> [[TMP1]], i10 [[C_I1]], i64 3 +; MIN32-NEXT: store <4 x i10> [[C]], ptr [[PA]], align 8 +; MIN32-NEXT: ret void +; + %a = load <4 x i10>, ptr %pa, align 8 + %b = load <4 x i10>, ptr %pb, align 8 + %c = add <4 x i10> %a, %b + store <4 x i10> %c, ptr %pa, align 8 + ret void +} + define <2 x half> @select_uniform_condition_v2f16(<2 x half> %a, <2 x half> %b, i1 %cc) { -; CHECK-LABEL: @select_uniform_condition_v2f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <2 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = extractelement <2 x half> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = select i1 [[CC:%.*]], half [[A_I0]], half [[B_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <2 x half> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <2 x half> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = select i1 [[CC]], half [[A_I1]], half [[B_I1]] -; CHECK-NEXT: 
[[R_UPTO0:%.*]] = insertelement <2 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R:%.*]] = insertelement <2 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: ret <2 x half> [[R]] +; MIN16-LABEL: @select_uniform_condition_v2f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <2 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[B_I0:%.*]] = extractelement <2 x half> [[B:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = select i1 [[CC:%.*]], half [[A_I0]], half [[B_I0]] +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <2 x half> [[A]], i64 1 +; MIN16-NEXT: [[B_I1:%.*]] = extractelement <2 x half> [[B]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = select i1 [[CC]], half [[A_I1]], half [[B_I1]] +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R:%.*]] = insertelement <2 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: ret <2 x half> [[R]] +; +; MIN32-LABEL: @select_uniform_condition_v2f16( +; MIN32-NEXT: [[R:%.*]] = select i1 [[CC:%.*]], <2 x half> [[A:%.*]], <2 x half> [[B:%.*]] +; MIN32-NEXT: ret <2 x half> [[R]] ; %r = select i1 %cc, <2 x half> %a, <2 x half> %b ret <2 x half> %r } define <3 x half> @select_uniform_condition_v3f16(<3 x half> %a, <3 x half> %b, i1 %cc) { -; CHECK-LABEL: @select_uniform_condition_v3f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <3 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = extractelement <3 x half> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = select i1 [[CC:%.*]], half [[A_I0]], half [[B_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <3 x half> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = select i1 [[CC]], half [[A_I1]], half [[B_I1]] -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <3 x half> [[A]], i64 2 -; CHECK-NEXT: [[B_I2:%.*]] = extractelement <3 x half> [[B]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = select i1 [[CC]], half [[A_I2]], half [[B_I2]] -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x half> 
poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: [[R:%.*]] = insertelement <3 x half> [[R_UPTO1]], half [[R_I2]], i64 2 -; CHECK-NEXT: ret <3 x half> [[R]] +; MIN16-LABEL: @select_uniform_condition_v3f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <3 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[B_I0:%.*]] = extractelement <3 x half> [[B:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = select i1 [[CC:%.*]], half [[A_I0]], half [[B_I0]] +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 1 +; MIN16-NEXT: [[B_I1:%.*]] = extractelement <3 x half> [[B]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = select i1 [[CC]], half [[A_I1]], half [[B_I1]] +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <3 x half> [[A]], i64 2 +; MIN16-NEXT: [[B_I2:%.*]] = extractelement <3 x half> [[B]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = select i1 [[CC]], half [[A_I2]], half [[B_I2]] +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: [[R:%.*]] = insertelement <3 x half> [[R_UPTO1]], half [[R_I2]], i64 2 +; MIN16-NEXT: ret <3 x half> [[R]] +; +; MIN32-LABEL: @select_uniform_condition_v3f16( +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <3 x half> [[A:%.*]], <3 x half> poison, <2 x i32> +; MIN32-NEXT: [[B_I0:%.*]] = shufflevector <3 x half> [[B:%.*]], <3 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = select i1 [[CC:%.*]], <2 x half> [[A_I0]], <2 x half> [[B_I0]] +; MIN32-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 2 +; MIN32-NEXT: [[B_I1:%.*]] = extractelement <3 x half> [[B]], i64 2 +; MIN32-NEXT: [[R_I1:%.*]] = select i1 [[CC]], half [[A_I1]], half [[B_I1]] +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[R_I0]], <2 x half> [[R_I0]], <3 x i32> +; MIN32-NEXT: [[R:%.*]] = insertelement <3 x half> [[TMP1]], half [[R_I1]], i64 2 +; MIN32-NEXT: ret 
<3 x half> [[R]] ; %r = select i1 %cc, <3 x half> %a, <3 x half> %b ret <3 x half> %r } define <4 x half> @select_uniform_condition_v4f16(<4 x half> %a, <4 x half> %b, i1 %cc) { -; CHECK-LABEL: @select_uniform_condition_v4f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = extractelement <4 x half> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = select i1 [[CC:%.*]], half [[A_I0]], half [[B_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <4 x half> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = select i1 [[CC]], half [[A_I1]], half [[B_I1]] -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 -; CHECK-NEXT: [[B_I2:%.*]] = extractelement <4 x half> [[B]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = select i1 [[CC]], half [[A_I2]], half [[B_I2]] -; CHECK-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 -; CHECK-NEXT: [[B_I3:%.*]] = extractelement <4 x half> [[B]], i64 3 -; CHECK-NEXT: [[R_I3:%.*]] = select i1 [[CC]], half [[A_I3]], half [[B_I3]] -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x half> [[R_UPTO1]], half [[R_I2]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x half> [[R_UPTO2]], half [[R_I3]], i64 3 -; CHECK-NEXT: ret <4 x half> [[R]] +; MIN16-LABEL: @select_uniform_condition_v4f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[B_I0:%.*]] = extractelement <4 x half> [[B:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = select i1 [[CC:%.*]], half [[A_I0]], half [[B_I0]] +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 +; MIN16-NEXT: [[B_I1:%.*]] = extractelement <4 x half> [[B]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = select i1 [[CC]], half [[A_I1]], half [[B_I1]] +; MIN16-NEXT: 
[[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 +; MIN16-NEXT: [[B_I2:%.*]] = extractelement <4 x half> [[B]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = select i1 [[CC]], half [[A_I2]], half [[B_I2]] +; MIN16-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 +; MIN16-NEXT: [[B_I3:%.*]] = extractelement <4 x half> [[B]], i64 3 +; MIN16-NEXT: [[R_I3:%.*]] = select i1 [[CC]], half [[A_I3]], half [[B_I3]] +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x half> [[R_UPTO1]], half [[R_I2]], i64 2 +; MIN16-NEXT: [[R:%.*]] = insertelement <4 x half> [[R_UPTO2]], half [[R_I3]], i64 3 +; MIN16-NEXT: ret <4 x half> [[R]] +; +; MIN32-LABEL: @select_uniform_condition_v4f16( +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[B_I0:%.*]] = shufflevector <4 x half> [[B:%.*]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = select i1 [[CC:%.*]], <2 x half> [[A_I0]], <2 x half> [[B_I0]] +; MIN32-NEXT: [[A_I1:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[B_I1:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I1:%.*]] = select i1 [[CC]], <2 x half> [[A_I1]], <2 x half> [[B_I1]] +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[R_I0]], <2 x half> [[R_I0]], <4 x i32> +; MIN32-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[R_I1]], <2 x half> [[R_I1]], <4 x i32> +; MIN32-NEXT: [[R:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x i32> +; MIN32-NEXT: ret <4 x half> [[R]] ; %r = select i1 %cc, <4 x half> %a, <4 x half> %b ret <4 x half> %r @@ -145,26 +250,7 @@ define <4 x half> @select_uniform_condition_v4f16(<4 x half> %a, <4 x half> %b, define <4 x half> @select_vector_condition_v4f16(<4 x half> %a, <4 x half> %b, <4 x i1> %cc) { 
; CHECK-LABEL: @select_vector_condition_v4f16( -; CHECK-NEXT: [[CC_I0:%.*]] = extractelement <4 x i1> [[CC:%.*]], i64 0 -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = extractelement <4 x half> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = select i1 [[CC_I0]], half [[A_I0]], half [[B_I0]] -; CHECK-NEXT: [[CC_I1:%.*]] = extractelement <4 x i1> [[CC]], i64 1 -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <4 x half> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = select i1 [[CC_I1]], half [[A_I1]], half [[B_I1]] -; CHECK-NEXT: [[CC_I2:%.*]] = extractelement <4 x i1> [[CC]], i64 2 -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 -; CHECK-NEXT: [[B_I2:%.*]] = extractelement <4 x half> [[B]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = select i1 [[CC_I2]], half [[A_I2]], half [[B_I2]] -; CHECK-NEXT: [[CC_I3:%.*]] = extractelement <4 x i1> [[CC]], i64 3 -; CHECK-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 -; CHECK-NEXT: [[B_I3:%.*]] = extractelement <4 x half> [[B]], i64 3 -; CHECK-NEXT: [[R_I3:%.*]] = select i1 [[CC_I3]], half [[A_I3]], half [[B_I3]] -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x half> [[R_UPTO1]], half [[R_I2]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x half> [[R_UPTO2]], half [[R_I3]], i64 3 +; CHECK-NEXT: [[R:%.*]] = select <4 x i1> [[CC:%.*]], <4 x half> [[A:%.*]], <4 x half> [[B:%.*]] ; CHECK-NEXT: ret <4 x half> [[R]] ; %r = select <4 x i1> %cc, <4 x half> %a, <4 x half> %b @@ -172,182 +258,259 @@ define <4 x half> @select_vector_condition_v4f16(<4 x half> %a, <4 x half> %b, < } define <2 x half> @unary_v2f16(<2 x half> %a) { -; CHECK-LABEL: @unary_v2f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <2 x half> 
[[A:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = fneg half [[A_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <2 x half> [[A]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = fneg half [[A_I1]] -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R:%.*]] = insertelement <2 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: ret <2 x half> [[R]] +; MIN16-LABEL: @unary_v2f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <2 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = fneg half [[A_I0]] +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <2 x half> [[A]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = fneg half [[A_I1]] +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R:%.*]] = insertelement <2 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: ret <2 x half> [[R]] +; +; MIN32-LABEL: @unary_v2f16( +; MIN32-NEXT: [[R:%.*]] = fneg <2 x half> [[A:%.*]] +; MIN32-NEXT: ret <2 x half> [[R]] ; %r = fneg <2 x half> %a ret <2 x half> %r } define <3 x half> @unary_v3f16(<3 x half> %a) { -; CHECK-LABEL: @unary_v3f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <3 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = fneg half [[A_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = fneg half [[A_I1]] -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <3 x half> [[A]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = fneg half [[A_I2]] -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: [[R:%.*]] = insertelement <3 x half> [[R_UPTO1]], half [[R_I2]], i64 2 -; CHECK-NEXT: ret <3 x half> [[R]] +; MIN16-LABEL: @unary_v3f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <3 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = fneg half [[A_I0]] +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 1 
+; MIN16-NEXT: [[R_I1:%.*]] = fneg half [[A_I1]] +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <3 x half> [[A]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = fneg half [[A_I2]] +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: [[R:%.*]] = insertelement <3 x half> [[R_UPTO1]], half [[R_I2]], i64 2 +; MIN16-NEXT: ret <3 x half> [[R]] +; +; MIN32-LABEL: @unary_v3f16( +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <3 x half> [[A:%.*]], <3 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = fneg <2 x half> [[A_I0]] +; MIN32-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 2 +; MIN32-NEXT: [[R_I1:%.*]] = fneg half [[A_I1]] +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[R_I0]], <2 x half> [[R_I0]], <3 x i32> +; MIN32-NEXT: [[R:%.*]] = insertelement <3 x half> [[TMP1]], half [[R_I1]], i64 2 +; MIN32-NEXT: ret <3 x half> [[R]] ; %r = fneg <3 x half> %a ret <3 x half> %r } define <4 x half> @unary_v4f16(<4 x half> %a) { -; CHECK-LABEL: @unary_v4f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = fneg half [[A_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = fneg half [[A_I1]] -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = fneg half [[A_I2]] -; CHECK-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 -; CHECK-NEXT: [[R_I3:%.*]] = fneg half [[A_I3]] -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x half> [[R_UPTO1]], half [[R_I2]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x half> [[R_UPTO2]], half [[R_I3]], i64 3 -; CHECK-NEXT: ret <4 x half> [[R]] +; MIN16-LABEL: 
@unary_v4f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = fneg half [[A_I0]] +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = fneg half [[A_I1]] +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = fneg half [[A_I2]] +; MIN16-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 +; MIN16-NEXT: [[R_I3:%.*]] = fneg half [[A_I3]] +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x half> [[R_UPTO1]], half [[R_I2]], i64 2 +; MIN16-NEXT: [[R:%.*]] = insertelement <4 x half> [[R_UPTO2]], half [[R_I3]], i64 3 +; MIN16-NEXT: ret <4 x half> [[R]] +; +; MIN32-LABEL: @unary_v4f16( +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = fneg <2 x half> [[A_I0]] +; MIN32-NEXT: [[A_I1:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I1:%.*]] = fneg <2 x half> [[A_I1]] +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[R_I0]], <2 x half> [[R_I0]], <4 x i32> +; MIN32-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[R_I1]], <2 x half> [[R_I1]], <4 x i32> +; MIN32-NEXT: [[R:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x i32> +; MIN32-NEXT: ret <4 x half> [[R]] ; %r = fneg <4 x half> %a ret <4 x half> %r } define <2 x half> @binary_v2f16(<2 x half> %a, <2 x half> %b) { -; CHECK-LABEL: @binary_v2f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <2 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = extractelement <2 x half> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = fadd half [[A_I0]], [[B_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <2 x half> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <2 x 
half> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = fadd half [[A_I1]], [[B_I1]] -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R:%.*]] = insertelement <2 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: ret <2 x half> [[R]] +; MIN16-LABEL: @binary_v2f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <2 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[B_I0:%.*]] = extractelement <2 x half> [[B:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = fadd half [[A_I0]], [[B_I0]] +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <2 x half> [[A]], i64 1 +; MIN16-NEXT: [[B_I1:%.*]] = extractelement <2 x half> [[B]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = fadd half [[A_I1]], [[B_I1]] +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R:%.*]] = insertelement <2 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: ret <2 x half> [[R]] +; +; MIN32-LABEL: @binary_v2f16( +; MIN32-NEXT: [[R:%.*]] = fadd <2 x half> [[A:%.*]], [[B:%.*]] +; MIN32-NEXT: ret <2 x half> [[R]] ; %r = fadd <2 x half> %a, %b ret <2 x half> %r } define <3 x half> @binary_v3f16(<3 x half> %a, <3 x half> %b) { -; CHECK-LABEL: @binary_v3f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <3 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = extractelement <3 x half> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = fadd half [[A_I0]], [[B_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <3 x half> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = fadd half [[A_I1]], [[B_I1]] -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <3 x half> [[A]], i64 2 -; CHECK-NEXT: [[B_I2:%.*]] = extractelement <3 x half> [[B]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = fadd half [[A_I2]], [[B_I2]] -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; 
CHECK-NEXT: [[R:%.*]] = insertelement <3 x half> [[R_UPTO1]], half [[R_I2]], i64 2 -; CHECK-NEXT: ret <3 x half> [[R]] +; MIN16-LABEL: @binary_v3f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <3 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[B_I0:%.*]] = extractelement <3 x half> [[B:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = fadd half [[A_I0]], [[B_I0]] +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 1 +; MIN16-NEXT: [[B_I1:%.*]] = extractelement <3 x half> [[B]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = fadd half [[A_I1]], [[B_I1]] +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <3 x half> [[A]], i64 2 +; MIN16-NEXT: [[B_I2:%.*]] = extractelement <3 x half> [[B]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = fadd half [[A_I2]], [[B_I2]] +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: [[R:%.*]] = insertelement <3 x half> [[R_UPTO1]], half [[R_I2]], i64 2 +; MIN16-NEXT: ret <3 x half> [[R]] +; +; MIN32-LABEL: @binary_v3f16( +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <3 x half> [[A:%.*]], <3 x half> poison, <2 x i32> +; MIN32-NEXT: [[B_I0:%.*]] = shufflevector <3 x half> [[B:%.*]], <3 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = fadd <2 x half> [[A_I0]], [[B_I0]] +; MIN32-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 2 +; MIN32-NEXT: [[B_I1:%.*]] = extractelement <3 x half> [[B]], i64 2 +; MIN32-NEXT: [[R_I1:%.*]] = fadd half [[A_I1]], [[B_I1]] +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[R_I0]], <2 x half> [[R_I0]], <3 x i32> +; MIN32-NEXT: [[R:%.*]] = insertelement <3 x half> [[TMP1]], half [[R_I1]], i64 2 +; MIN32-NEXT: ret <3 x half> [[R]] ; %r = fadd <3 x half> %a, %b ret <3 x half> %r } define <4 x half> @binary_v4f16(<4 x half> %a, <4 x half> %b) { -; CHECK-LABEL: @binary_v4f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = 
extractelement <4 x half> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = fadd half [[A_I0]], [[B_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <4 x half> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = fadd half [[A_I1]], [[B_I1]] -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 -; CHECK-NEXT: [[B_I2:%.*]] = extractelement <4 x half> [[B]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = fadd half [[A_I2]], [[B_I2]] -; CHECK-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 -; CHECK-NEXT: [[B_I3:%.*]] = extractelement <4 x half> [[B]], i64 3 -; CHECK-NEXT: [[R_I3:%.*]] = fadd half [[A_I3]], [[B_I3]] -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x half> [[R_UPTO1]], half [[R_I2]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x half> [[R_UPTO2]], half [[R_I3]], i64 3 -; CHECK-NEXT: ret <4 x half> [[R]] +; MIN16-LABEL: @binary_v4f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[B_I0:%.*]] = extractelement <4 x half> [[B:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = fadd half [[A_I0]], [[B_I0]] +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 +; MIN16-NEXT: [[B_I1:%.*]] = extractelement <4 x half> [[B]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = fadd half [[A_I1]], [[B_I1]] +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 +; MIN16-NEXT: [[B_I2:%.*]] = extractelement <4 x half> [[B]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = fadd half [[A_I2]], [[B_I2]] +; MIN16-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 +; MIN16-NEXT: [[B_I3:%.*]] = extractelement <4 x half> [[B]], i64 3 +; MIN16-NEXT: [[R_I3:%.*]] = fadd half [[A_I3]], [[B_I3]] +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x half> poison, half [[R_I0]], i64 0 
+; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x half> [[R_UPTO1]], half [[R_I2]], i64 2 +; MIN16-NEXT: [[R:%.*]] = insertelement <4 x half> [[R_UPTO2]], half [[R_I3]], i64 3 +; MIN16-NEXT: ret <4 x half> [[R]] +; +; MIN32-LABEL: @binary_v4f16( +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[B_I0:%.*]] = shufflevector <4 x half> [[B:%.*]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = fadd <2 x half> [[A_I0]], [[B_I0]] +; MIN32-NEXT: [[A_I1:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[B_I1:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I1:%.*]] = fadd <2 x half> [[A_I1]], [[B_I1]] +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[R_I0]], <2 x half> [[R_I0]], <4 x i32> +; MIN32-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[R_I1]], <2 x half> [[R_I1]], <4 x i32> +; MIN32-NEXT: [[R:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x i32> +; MIN32-NEXT: ret <4 x half> [[R]] ; %r = fadd <4 x half> %a, %b ret <4 x half> %r } define <2 x i16> @fptosi_v2f16(<2 x half> %a) { -; CHECK-LABEL: @fptosi_v2f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <2 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = fptosi half [[A_I0]] to i16 -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <2 x half> [[A]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = fptosi half [[A_I1]] to i16 -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x i16> poison, i16 [[R_I0]], i64 0 -; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i16> [[R_UPTO0]], i16 [[R_I1]], i64 1 -; CHECK-NEXT: ret <2 x i16> [[R]] +; MIN16-LABEL: @fptosi_v2f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <2 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = fptosi half [[A_I0]] to i16 +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <2 x half> [[A]], i64 1 +; MIN16-NEXT: 
[[R_I1:%.*]] = fptosi half [[A_I1]] to i16 +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x i16> poison, i16 [[R_I0]], i64 0 +; MIN16-NEXT: [[R:%.*]] = insertelement <2 x i16> [[R_UPTO0]], i16 [[R_I1]], i64 1 +; MIN16-NEXT: ret <2 x i16> [[R]] +; +; MIN32-LABEL: @fptosi_v2f16( +; MIN32-NEXT: [[R:%.*]] = fptosi <2 x half> [[A:%.*]] to <2 x i16> +; MIN32-NEXT: ret <2 x i16> [[R]] ; %r = fptosi <2 x half> %a to <2 x i16> ret <2 x i16> %r } define <3 x i16> @fptosi_v3f16(<3 x half> %a) { -; CHECK-LABEL: @fptosi_v3f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <3 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = fptosi half [[A_I0]] to i16 -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = fptosi half [[A_I1]] to i16 -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <3 x half> [[A]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = fptosi half [[A_I2]] to i16 -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x i16> poison, i16 [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x i16> [[R_UPTO0]], i16 [[R_I1]], i64 1 -; CHECK-NEXT: [[R:%.*]] = insertelement <3 x i16> [[R_UPTO1]], i16 [[R_I2]], i64 2 -; CHECK-NEXT: ret <3 x i16> [[R]] +; MIN16-LABEL: @fptosi_v3f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <3 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = fptosi half [[A_I0]] to i16 +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = fptosi half [[A_I1]] to i16 +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <3 x half> [[A]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = fptosi half [[A_I2]] to i16 +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x i16> poison, i16 [[R_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x i16> [[R_UPTO0]], i16 [[R_I1]], i64 1 +; MIN16-NEXT: [[R:%.*]] = insertelement <3 x i16> [[R_UPTO1]], i16 [[R_I2]], i64 2 +; MIN16-NEXT: ret <3 x i16> [[R]] +; +; MIN32-LABEL: @fptosi_v3f16( +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <3 x 
half> [[A:%.*]], <3 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = fptosi <2 x half> [[A_I0]] to <2 x i16> +; MIN32-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 2 +; MIN32-NEXT: [[R_I1:%.*]] = fptosi half [[A_I1]] to i16 +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[R_I0]], <2 x i16> [[R_I0]], <3 x i32> +; MIN32-NEXT: [[R:%.*]] = insertelement <3 x i16> [[TMP1]], i16 [[R_I1]], i64 2 +; MIN32-NEXT: ret <3 x i16> [[R]] ; %r = fptosi <3 x half> %a to <3 x i16> ret <3 x i16> %r } define <4 x i16> @fptosi_v4f16(<4 x half> %a) { -; CHECK-LABEL: @fptosi_v4f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = fptosi half [[A_I0]] to i16 -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = fptosi half [[A_I1]] to i16 -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = fptosi half [[A_I2]] to i16 -; CHECK-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 -; CHECK-NEXT: [[R_I3:%.*]] = fptosi half [[A_I3]] to i16 -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x i16> poison, i16 [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x i16> [[R_UPTO0]], i16 [[R_I1]], i64 1 -; CHECK-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x i16> [[R_UPTO1]], i16 [[R_I2]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i16> [[R_UPTO2]], i16 [[R_I3]], i64 3 -; CHECK-NEXT: ret <4 x i16> [[R]] +; MIN16-LABEL: @fptosi_v4f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = fptosi half [[A_I0]] to i16 +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = fptosi half [[A_I1]] to i16 +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = fptosi half [[A_I2]] to i16 +; MIN16-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 +; MIN16-NEXT: 
[[R_I3:%.*]] = fptosi half [[A_I3]] to i16 +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x i16> poison, i16 [[R_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x i16> [[R_UPTO0]], i16 [[R_I1]], i64 1 +; MIN16-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x i16> [[R_UPTO1]], i16 [[R_I2]], i64 2 +; MIN16-NEXT: [[R:%.*]] = insertelement <4 x i16> [[R_UPTO2]], i16 [[R_I3]], i64 3 +; MIN16-NEXT: ret <4 x i16> [[R]] +; +; MIN32-LABEL: @fptosi_v4f16( +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = fptosi <2 x half> [[A_I0]] to <2 x i16> +; MIN32-NEXT: [[A_I1:%.*]] = shufflevector <4 x half> [[A]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I1:%.*]] = fptosi <2 x half> [[A_I1]] to <2 x i16> +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[R_I0]], <2 x i16> [[R_I0]], <4 x i32> +; MIN32-NEXT: [[TMP2:%.*]] = shufflevector <2 x i16> [[R_I1]], <2 x i16> [[R_I1]], <4 x i32> +; MIN32-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i32> +; MIN32-NEXT: ret <4 x i16> [[R]] ; %r = fptosi <4 x half> %a to <4 x i16> ret <4 x i16> %r } define <4 x float> @fpext_v4f16(<4 x half> %a) { -; CHECK-LABEL: @fpext_v4f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = fpext half [[A_I0]] to float -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = fpext half [[A_I1]] to float -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = fpext half [[A_I2]] to float -; CHECK-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 -; CHECK-NEXT: [[R_I3:%.*]] = fpext half [[A_I3]] to float -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x float> poison, float [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x float> [[R_UPTO0]], float [[R_I1]], i64 1 -; CHECK-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x float> 
[[R_UPTO1]], float [[R_I2]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[R_UPTO2]], float [[R_I3]], i64 3 -; CHECK-NEXT: ret <4 x float> [[R]] +; MIN16-LABEL: @fpext_v4f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = fpext half [[A_I0]] to float +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = fpext half [[A_I1]] to float +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = fpext half [[A_I2]] to float +; MIN16-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 +; MIN16-NEXT: [[R_I3:%.*]] = fpext half [[A_I3]] to float +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x float> poison, float [[R_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x float> [[R_UPTO0]], float [[R_I1]], i64 1 +; MIN16-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x float> [[R_UPTO1]], float [[R_I2]], i64 2 +; MIN16-NEXT: [[R:%.*]] = insertelement <4 x float> [[R_UPTO2]], float [[R_I3]], i64 3 +; MIN16-NEXT: ret <4 x float> [[R]] +; +; MIN32-LABEL: @fpext_v4f16( +; MIN32-NEXT: [[R:%.*]] = fpext <4 x half> [[A:%.*]] to <4 x float> +; MIN32-NEXT: ret <4 x float> [[R]] ; %r = fpext <4 x half> %a to <4 x float> ret <4 x float> %r @@ -355,22 +518,7 @@ define <4 x float> @fpext_v4f16(<4 x half> %a) { define <4 x i1> @icmp_v4f16(<4 x i16> %a, <4 x i16> %b) { ; CHECK-LABEL: @icmp_v4f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = extractelement <4 x i16> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = icmp ugt i16 [[A_I0]], [[B_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x i16> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <4 x i16> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = icmp ugt i16 [[A_I1]], [[B_I1]] -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x i16> [[A]], i64 2 -; CHECK-NEXT: [[B_I2:%.*]] = extractelement <4 x i16> [[B]], 
i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = icmp ugt i16 [[A_I2]], [[B_I2]] -; CHECK-NEXT: [[A_I3:%.*]] = extractelement <4 x i16> [[A]], i64 3 -; CHECK-NEXT: [[B_I3:%.*]] = extractelement <4 x i16> [[B]], i64 3 -; CHECK-NEXT: [[R_I3:%.*]] = icmp ugt i16 [[A_I3]], [[B_I3]] -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x i1> poison, i1 [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x i1> [[R_UPTO0]], i1 [[R_I1]], i64 1 -; CHECK-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x i1> [[R_UPTO1]], i1 [[R_I2]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i1> [[R_UPTO2]], i1 [[R_I3]], i64 3 +; CHECK-NEXT: [[R:%.*]] = icmp ugt <4 x i16> [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: ret <4 x i1> [[R]] ; %r = icmp ugt <4 x i16> %a, %b @@ -378,26 +526,24 @@ define <4 x i1> @icmp_v4f16(<4 x i16> %a, <4 x i16> %b) { } define <4 x ptr> @gep1_v4(ptr %base, <4 x i16> %a) { -; CHECK-LABEL: @gep1_v4( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0 -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x i16> [[A]], i64 1 -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x i16> [[A]], i64 2 -; CHECK-NEXT: [[A_I3:%.*]] = extractelement <4 x i16> [[A]], i64 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[BASE:%.*]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> [[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT_I0:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 0 -; CHECK-NEXT: [[P_I0:%.*]] = getelementptr i32, ptr [[DOTSPLAT_I0]], i16 [[A_I0]] -; CHECK-NEXT: [[DOTSPLAT_I1:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 1 -; CHECK-NEXT: [[P_I1:%.*]] = getelementptr i32, ptr [[DOTSPLAT_I1]], i16 [[A_I1]] -; CHECK-NEXT: [[DOTSPLAT_I2:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 2 -; CHECK-NEXT: [[P_I2:%.*]] = getelementptr i32, ptr [[DOTSPLAT_I2]], i16 [[A_I2]] -; CHECK-NEXT: [[DOTSPLAT_I3:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 3 -; CHECK-NEXT: [[P_I3:%.*]] 
= getelementptr i32, ptr [[DOTSPLAT_I3]], i16 [[A_I3]] -; CHECK-NEXT: [[P_UPTO0:%.*]] = insertelement <4 x ptr> poison, ptr [[P_I0]], i64 0 -; CHECK-NEXT: [[P_UPTO1:%.*]] = insertelement <4 x ptr> [[P_UPTO0]], ptr [[P_I1]], i64 1 -; CHECK-NEXT: [[P_UPTO2:%.*]] = insertelement <4 x ptr> [[P_UPTO1]], ptr [[P_I2]], i64 2 -; CHECK-NEXT: [[P:%.*]] = insertelement <4 x ptr> [[P_UPTO2]], ptr [[P_I3]], i64 3 -; CHECK-NEXT: ret <4 x ptr> [[P]] +; MIN16-LABEL: @gep1_v4( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0 +; MIN16-NEXT: [[P_I0:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i16 [[A_I0]] +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <4 x i16> [[A]], i64 1 +; MIN16-NEXT: [[P_I1:%.*]] = getelementptr i32, ptr [[BASE]], i16 [[A_I1]] +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <4 x i16> [[A]], i64 2 +; MIN16-NEXT: [[P_I2:%.*]] = getelementptr i32, ptr [[BASE]], i16 [[A_I2]] +; MIN16-NEXT: [[A_I3:%.*]] = extractelement <4 x i16> [[A]], i64 3 +; MIN16-NEXT: [[P_I3:%.*]] = getelementptr i32, ptr [[BASE]], i16 [[A_I3]] +; MIN16-NEXT: [[P_UPTO0:%.*]] = insertelement <4 x ptr> poison, ptr [[P_I0]], i64 0 +; MIN16-NEXT: [[P_UPTO1:%.*]] = insertelement <4 x ptr> [[P_UPTO0]], ptr [[P_I1]], i64 1 +; MIN16-NEXT: [[P_UPTO2:%.*]] = insertelement <4 x ptr> [[P_UPTO1]], ptr [[P_I2]], i64 2 +; MIN16-NEXT: [[P:%.*]] = insertelement <4 x ptr> [[P_UPTO2]], ptr [[P_I3]], i64 3 +; MIN16-NEXT: ret <4 x ptr> [[P]] +; +; MIN32-LABEL: @gep1_v4( +; MIN32-NEXT: [[P:%.*]] = getelementptr i32, ptr [[BASE:%.*]], <4 x i16> [[A:%.*]] +; MIN32-NEXT: ret <4 x ptr> [[P]] ; %p = getelementptr i32, ptr %base, <4 x i16> %a ret <4 x ptr> %p @@ -405,20 +551,14 @@ define <4 x ptr> @gep1_v4(ptr %base, <4 x i16> %a) { define <4 x ptr> @gep2_v4(<4 x ptr> %base, i16 %a) { ; CHECK-LABEL: @gep2_v4( -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[A:%.*]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> 
zeroinitializer -; CHECK-NEXT: [[DOTSPLAT_I0:%.*]] = extractelement <4 x i16> [[DOTSPLAT]], i64 0 ; CHECK-NEXT: [[BASE_I0:%.*]] = extractelement <4 x ptr> [[BASE:%.*]], i64 0 -; CHECK-NEXT: [[P_I0:%.*]] = getelementptr i32, ptr [[BASE_I0]], i16 [[DOTSPLAT_I0]] -; CHECK-NEXT: [[DOTSPLAT_I1:%.*]] = extractelement <4 x i16> [[DOTSPLAT]], i64 1 +; CHECK-NEXT: [[P_I0:%.*]] = getelementptr i32, ptr [[BASE_I0]], i16 [[A:%.*]] ; CHECK-NEXT: [[BASE_I1:%.*]] = extractelement <4 x ptr> [[BASE]], i64 1 -; CHECK-NEXT: [[P_I1:%.*]] = getelementptr i32, ptr [[BASE_I1]], i16 [[DOTSPLAT_I1]] -; CHECK-NEXT: [[DOTSPLAT_I2:%.*]] = extractelement <4 x i16> [[DOTSPLAT]], i64 2 +; CHECK-NEXT: [[P_I1:%.*]] = getelementptr i32, ptr [[BASE_I1]], i16 [[A]] ; CHECK-NEXT: [[BASE_I2:%.*]] = extractelement <4 x ptr> [[BASE]], i64 2 -; CHECK-NEXT: [[P_I2:%.*]] = getelementptr i32, ptr [[BASE_I2]], i16 [[DOTSPLAT_I2]] -; CHECK-NEXT: [[DOTSPLAT_I3:%.*]] = extractelement <4 x i16> [[DOTSPLAT]], i64 3 +; CHECK-NEXT: [[P_I2:%.*]] = getelementptr i32, ptr [[BASE_I2]], i16 [[A]] ; CHECK-NEXT: [[BASE_I3:%.*]] = extractelement <4 x ptr> [[BASE]], i64 3 -; CHECK-NEXT: [[P_I3:%.*]] = getelementptr i32, ptr [[BASE_I3]], i16 [[DOTSPLAT_I3]] +; CHECK-NEXT: [[P_I3:%.*]] = getelementptr i32, ptr [[BASE_I3]], i16 [[A]] ; CHECK-NEXT: [[P_UPTO0:%.*]] = insertelement <4 x ptr> poison, ptr [[P_I0]], i64 0 ; CHECK-NEXT: [[P_UPTO1:%.*]] = insertelement <4 x ptr> [[P_UPTO0]], ptr [[P_I1]], i64 1 ; CHECK-NEXT: [[P_UPTO2:%.*]] = insertelement <4 x ptr> [[P_UPTO1]], ptr [[P_I2]], i64 2 @@ -430,36 +570,45 @@ define <4 x ptr> @gep2_v4(<4 x ptr> %base, i16 %a) { } define <4 x ptr> @gep3_v4(<4 x ptr> %base, <4 x i16> %a) { -; CHECK-LABEL: @gep3_v4( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0 -; CHECK-NEXT: [[BASE_I0:%.*]] = extractelement <4 x ptr> [[BASE:%.*]], i64 0 -; CHECK-NEXT: [[P_I0:%.*]] = getelementptr i32, ptr [[BASE_I0]], i16 [[A_I0]] -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x i16> 
[[A]], i64 1 -; CHECK-NEXT: [[BASE_I1:%.*]] = extractelement <4 x ptr> [[BASE]], i64 1 -; CHECK-NEXT: [[P_I1:%.*]] = getelementptr i32, ptr [[BASE_I1]], i16 [[A_I1]] -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x i16> [[A]], i64 2 -; CHECK-NEXT: [[BASE_I2:%.*]] = extractelement <4 x ptr> [[BASE]], i64 2 -; CHECK-NEXT: [[P_I2:%.*]] = getelementptr i32, ptr [[BASE_I2]], i16 [[A_I2]] -; CHECK-NEXT: [[A_I3:%.*]] = extractelement <4 x i16> [[A]], i64 3 -; CHECK-NEXT: [[BASE_I3:%.*]] = extractelement <4 x ptr> [[BASE]], i64 3 -; CHECK-NEXT: [[P_I3:%.*]] = getelementptr i32, ptr [[BASE_I3]], i16 [[A_I3]] -; CHECK-NEXT: [[P_UPTO0:%.*]] = insertelement <4 x ptr> poison, ptr [[P_I0]], i64 0 -; CHECK-NEXT: [[P_UPTO1:%.*]] = insertelement <4 x ptr> [[P_UPTO0]], ptr [[P_I1]], i64 1 -; CHECK-NEXT: [[P_UPTO2:%.*]] = insertelement <4 x ptr> [[P_UPTO1]], ptr [[P_I2]], i64 2 -; CHECK-NEXT: [[P:%.*]] = insertelement <4 x ptr> [[P_UPTO2]], ptr [[P_I3]], i64 3 -; CHECK-NEXT: ret <4 x ptr> [[P]] +; MIN16-LABEL: @gep3_v4( +; MIN16-NEXT: [[BASE_I0:%.*]] = extractelement <4 x ptr> [[BASE:%.*]], i64 0 +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0 +; MIN16-NEXT: [[P_I0:%.*]] = getelementptr i32, ptr [[BASE_I0]], i16 [[A_I0]] +; MIN16-NEXT: [[BASE_I1:%.*]] = extractelement <4 x ptr> [[BASE]], i64 1 +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <4 x i16> [[A]], i64 1 +; MIN16-NEXT: [[P_I1:%.*]] = getelementptr i32, ptr [[BASE_I1]], i16 [[A_I1]] +; MIN16-NEXT: [[BASE_I2:%.*]] = extractelement <4 x ptr> [[BASE]], i64 2 +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <4 x i16> [[A]], i64 2 +; MIN16-NEXT: [[P_I2:%.*]] = getelementptr i32, ptr [[BASE_I2]], i16 [[A_I2]] +; MIN16-NEXT: [[BASE_I3:%.*]] = extractelement <4 x ptr> [[BASE]], i64 3 +; MIN16-NEXT: [[A_I3:%.*]] = extractelement <4 x i16> [[A]], i64 3 +; MIN16-NEXT: [[P_I3:%.*]] = getelementptr i32, ptr [[BASE_I3]], i16 [[A_I3]] +; MIN16-NEXT: [[P_UPTO0:%.*]] = insertelement <4 x ptr> poison, ptr [[P_I0]], i64 0 +; 
MIN16-NEXT: [[P_UPTO1:%.*]] = insertelement <4 x ptr> [[P_UPTO0]], ptr [[P_I1]], i64 1 +; MIN16-NEXT: [[P_UPTO2:%.*]] = insertelement <4 x ptr> [[P_UPTO1]], ptr [[P_I2]], i64 2 +; MIN16-NEXT: [[P:%.*]] = insertelement <4 x ptr> [[P_UPTO2]], ptr [[P_I3]], i64 3 +; MIN16-NEXT: ret <4 x ptr> [[P]] +; +; MIN32-LABEL: @gep3_v4( +; MIN32-NEXT: [[P:%.*]] = getelementptr i32, <4 x ptr> [[BASE:%.*]], <4 x i16> [[A:%.*]] +; MIN32-NEXT: ret <4 x ptr> [[P]] ; %p = getelementptr i32, <4 x ptr> %base, <4 x i16> %a ret <4 x ptr> %p } define void @insertelement_v2i16(ptr %p, <2 x i16> %a, i16 %b) { -; CHECK-LABEL: @insertelement_v2i16( -; CHECK-NEXT: [[P_I1:%.*]] = getelementptr i16, ptr [[P:%.*]], i32 1 -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <2 x i16> [[A:%.*]], i64 0 -; CHECK-NEXT: store i16 [[A_I0]], ptr [[P]], align 4 -; CHECK-NEXT: store i16 [[B:%.*]], ptr [[P_I1]], align 2 -; CHECK-NEXT: ret void +; MIN16-LABEL: @insertelement_v2i16( +; MIN16-NEXT: [[P_I1:%.*]] = getelementptr i16, ptr [[P:%.*]], i32 1 +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <2 x i16> [[A:%.*]], i64 0 +; MIN16-NEXT: store i16 [[A_I0]], ptr [[P]], align 4 +; MIN16-NEXT: store i16 [[B:%.*]], ptr [[P_I1]], align 2 +; MIN16-NEXT: ret void +; +; MIN32-LABEL: @insertelement_v2i16( +; MIN32-NEXT: [[R:%.*]] = insertelement <2 x i16> [[A:%.*]], i16 [[B:%.*]], i64 1 +; MIN32-NEXT: store <2 x i16> [[R]], ptr [[P:%.*]], align 4 +; MIN32-NEXT: ret void ; %r = insertelement <2 x i16> %a, i16 %b, i64 1 store <2 x i16> %r, ptr %p @@ -467,15 +616,22 @@ define void @insertelement_v2i16(ptr %p, <2 x i16> %a, i16 %b) { } define void @insertelement_v3i16(ptr %p, <3 x i16> %a, i16 %b) { -; CHECK-LABEL: @insertelement_v3i16( -; CHECK-NEXT: [[P_I1:%.*]] = getelementptr i16, ptr [[P:%.*]], i32 1 -; CHECK-NEXT: [[P_I2:%.*]] = getelementptr i16, ptr [[P]], i32 2 -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <3 x i16> [[A:%.*]], i64 0 -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <3 x i16> [[A]], i64 1 -; CHECK-NEXT: store i16 
[[A_I0]], ptr [[P]], align 8 -; CHECK-NEXT: store i16 [[A_I1]], ptr [[P_I1]], align 2 -; CHECK-NEXT: store i16 [[B:%.*]], ptr [[P_I2]], align 4 -; CHECK-NEXT: ret void +; MIN16-LABEL: @insertelement_v3i16( +; MIN16-NEXT: [[P_I1:%.*]] = getelementptr i16, ptr [[P:%.*]], i32 1 +; MIN16-NEXT: [[P_I2:%.*]] = getelementptr i16, ptr [[P]], i32 2 +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <3 x i16> [[A:%.*]], i64 0 +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <3 x i16> [[A]], i64 1 +; MIN16-NEXT: store i16 [[A_I0]], ptr [[P]], align 8 +; MIN16-NEXT: store i16 [[A_I1]], ptr [[P_I1]], align 2 +; MIN16-NEXT: store i16 [[B:%.*]], ptr [[P_I2]], align 4 +; MIN16-NEXT: ret void +; +; MIN32-LABEL: @insertelement_v3i16( +; MIN32-NEXT: [[P_I1:%.*]] = getelementptr <2 x i16>, ptr [[P:%.*]], i32 1 +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <3 x i16> [[A:%.*]], <3 x i16> poison, <2 x i32> +; MIN32-NEXT: store <2 x i16> [[A_I0]], ptr [[P]], align 8 +; MIN32-NEXT: store i16 [[B:%.*]], ptr [[P_I1]], align 4 +; MIN32-NEXT: ret void ; %r = insertelement <3 x i16> %a, i16 %b, i64 2 store <3 x i16> %r, ptr %p @@ -483,18 +639,27 @@ define void @insertelement_v3i16(ptr %p, <3 x i16> %a, i16 %b) { } define void @insertelement_v4i16(ptr %p, <4 x i16> %a, i16 %b) { -; CHECK-LABEL: @insertelement_v4i16( -; CHECK-NEXT: [[P_I1:%.*]] = getelementptr i16, ptr [[P:%.*]], i32 1 -; CHECK-NEXT: [[P_I2:%.*]] = getelementptr i16, ptr [[P]], i32 2 -; CHECK-NEXT: [[P_I3:%.*]] = getelementptr i16, ptr [[P]], i32 3 -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0 -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x i16> [[A]], i64 1 -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x i16> [[A]], i64 2 -; CHECK-NEXT: store i16 [[A_I0]], ptr [[P]], align 8 -; CHECK-NEXT: store i16 [[A_I1]], ptr [[P_I1]], align 2 -; CHECK-NEXT: store i16 [[A_I2]], ptr [[P_I2]], align 4 -; CHECK-NEXT: store i16 [[B:%.*]], ptr [[P_I3]], align 2 -; CHECK-NEXT: ret void +; MIN16-LABEL: @insertelement_v4i16( +; 
MIN16-NEXT: [[P_I1:%.*]] = getelementptr i16, ptr [[P:%.*]], i32 1 +; MIN16-NEXT: [[P_I2:%.*]] = getelementptr i16, ptr [[P]], i32 2 +; MIN16-NEXT: [[P_I3:%.*]] = getelementptr i16, ptr [[P]], i32 3 +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <4 x i16> [[A:%.*]], i64 0 +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <4 x i16> [[A]], i64 1 +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <4 x i16> [[A]], i64 2 +; MIN16-NEXT: store i16 [[A_I0]], ptr [[P]], align 8 +; MIN16-NEXT: store i16 [[A_I1]], ptr [[P_I1]], align 2 +; MIN16-NEXT: store i16 [[A_I2]], ptr [[P_I2]], align 4 +; MIN16-NEXT: store i16 [[B:%.*]], ptr [[P_I3]], align 2 +; MIN16-NEXT: ret void +; +; MIN32-LABEL: @insertelement_v4i16( +; MIN32-NEXT: [[P_I1:%.*]] = getelementptr <2 x i16>, ptr [[P:%.*]], i32 1 +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <4 x i16> [[A:%.*]], <4 x i16> poison, <2 x i32> +; MIN32-NEXT: [[A_I1:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <2 x i32> +; MIN32-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[A_I1]], i16 [[B:%.*]], i64 1 +; MIN32-NEXT: store <2 x i16> [[A_I0]], ptr [[P]], align 8 +; MIN32-NEXT: store <2 x i16> [[TMP1]], ptr [[P_I1]], align 4 +; MIN32-NEXT: ret void ; %r = insertelement <4 x i16> %a, i16 %b, i64 3 store <4 x i16> %r, ptr %p @@ -502,11 +667,16 @@ define void @insertelement_v4i16(ptr %p, <4 x i16> %a, i16 %b) { } define <2 x i16> @load_insertelement_v2i16(ptr %pa, i16 %b) { -; CHECK-LABEL: @load_insertelement_v2i16( -; CHECK-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 4 -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x i16> poison, i16 [[A_I0]], i64 0 -; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i16> [[R_UPTO0]], i16 [[B:%.*]], i64 1 -; CHECK-NEXT: ret <2 x i16> [[R]] +; MIN16-LABEL: @load_insertelement_v2i16( +; MIN16-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 4 +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x i16> poison, i16 [[A_I0]], i64 0 +; MIN16-NEXT: [[R:%.*]] = insertelement <2 x i16> [[R_UPTO0]], i16 [[B:%.*]], 
i64 1 +; MIN16-NEXT: ret <2 x i16> [[R]] +; +; MIN32-LABEL: @load_insertelement_v2i16( +; MIN32-NEXT: [[A:%.*]] = load <2 x i16>, ptr [[PA:%.*]], align 4 +; MIN32-NEXT: [[R:%.*]] = insertelement <2 x i16> [[A]], i16 [[B:%.*]], i64 1 +; MIN32-NEXT: ret <2 x i16> [[R]] ; %a = load <2 x i16>, ptr %pa %r = insertelement <2 x i16> %a, i16 %b, i64 1 @@ -514,14 +684,20 @@ define <2 x i16> @load_insertelement_v2i16(ptr %pa, i16 %b) { } define <3 x i16> @load_insertelement_v3i16(ptr %pa, i16 %b) { -; CHECK-LABEL: @load_insertelement_v3i16( -; CHECK-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 8 -; CHECK-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 -; CHECK-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x i16> poison, i16 [[A_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x i16> [[R_UPTO0]], i16 [[A_I1]], i64 1 -; CHECK-NEXT: [[R:%.*]] = insertelement <3 x i16> [[R_UPTO1]], i16 [[B:%.*]], i64 2 -; CHECK-NEXT: ret <3 x i16> [[R]] +; MIN16-LABEL: @load_insertelement_v3i16( +; MIN16-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 8 +; MIN16-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 +; MIN16-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x i16> poison, i16 [[A_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x i16> [[R_UPTO0]], i16 [[A_I1]], i64 1 +; MIN16-NEXT: [[R:%.*]] = insertelement <3 x i16> [[R_UPTO1]], i16 [[B:%.*]], i64 2 +; MIN16-NEXT: ret <3 x i16> [[R]] +; +; MIN32-LABEL: @load_insertelement_v3i16( +; MIN32-NEXT: [[A_I0:%.*]] = load <2 x i16>, ptr [[PA:%.*]], align 8 +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[A_I0]], <2 x i16> [[A_I0]], <3 x i32> +; MIN32-NEXT: [[R:%.*]] = insertelement <3 x i16> [[TMP1]], i16 [[B:%.*]], i64 2 +; MIN32-NEXT: ret <3 x i16> [[R]] ; %a = load <3 x i16>, ptr %pa %r = insertelement <3 x i16> %a, i16 %b, i64 2 @@ -529,17 +705,27 @@ define <3 x 
i16> @load_insertelement_v3i16(ptr %pa, i16 %b) { } define <4 x i16> @load_insertelement_v4i16(ptr %pa, i16 %b) { -; CHECK-LABEL: @load_insertelement_v4i16( -; CHECK-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 8 -; CHECK-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 -; CHECK-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 -; CHECK-NEXT: [[PA_I2:%.*]] = getelementptr i16, ptr [[PA]], i32 2 -; CHECK-NEXT: [[A_I2:%.*]] = load i16, ptr [[PA_I2]], align 4 -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x i16> poison, i16 [[A_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x i16> [[R_UPTO0]], i16 [[A_I1]], i64 1 -; CHECK-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x i16> [[R_UPTO1]], i16 [[A_I2]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i16> [[R_UPTO2]], i16 [[B:%.*]], i64 3 -; CHECK-NEXT: ret <4 x i16> [[R]] +; MIN16-LABEL: @load_insertelement_v4i16( +; MIN16-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA:%.*]], align 8 +; MIN16-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 +; MIN16-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 +; MIN16-NEXT: [[PA_I2:%.*]] = getelementptr i16, ptr [[PA]], i32 2 +; MIN16-NEXT: [[A_I2:%.*]] = load i16, ptr [[PA_I2]], align 4 +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x i16> poison, i16 [[A_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x i16> [[R_UPTO0]], i16 [[A_I1]], i64 1 +; MIN16-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x i16> [[R_UPTO1]], i16 [[A_I2]], i64 2 +; MIN16-NEXT: [[R:%.*]] = insertelement <4 x i16> [[R_UPTO2]], i16 [[B:%.*]], i64 3 +; MIN16-NEXT: ret <4 x i16> [[R]] +; +; MIN32-LABEL: @load_insertelement_v4i16( +; MIN32-NEXT: [[A_I0:%.*]] = load <2 x i16>, ptr [[PA:%.*]], align 8 +; MIN32-NEXT: [[PA_I1:%.*]] = getelementptr <2 x i16>, ptr [[PA]], i32 1 +; MIN32-NEXT: [[A_I1:%.*]] = load <2 x i16>, ptr [[PA_I1]], align 4 +; MIN32-NEXT: [[TMP1:%.*]] = insertelement <2 x i16> [[A_I1]], i16 [[B:%.*]], i64 1 +; MIN32-NEXT: [[TMP2:%.*]] = 
shufflevector <2 x i16> [[A_I0]], <2 x i16> [[A_I0]], <4 x i32> +; MIN32-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> [[TMP1]], <4 x i32> +; MIN32-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> [[TMP3]], <4 x i32> +; MIN32-NEXT: ret <4 x i16> [[R]] ; %a = load <4 x i16>, ptr %pa %r = insertelement <4 x i16> %a, i16 %b, i64 3 @@ -547,21 +733,31 @@ define <4 x i16> @load_insertelement_v4i16(ptr %pa, i16 %b) { } define void @shufflevector_grow(ptr %pa, ptr %pb) { -; CHECK-LABEL: @shufflevector_grow( -; CHECK-NEXT: [[PA_I11:%.*]] = getelementptr i16, ptr [[PA:%.*]], i32 1 -; CHECK-NEXT: [[PA_I2:%.*]] = getelementptr i16, ptr [[PA]], i32 2 -; CHECK-NEXT: [[PA_I3:%.*]] = getelementptr i16, ptr [[PA]], i32 3 -; CHECK-NEXT: [[PB_I1:%.*]] = getelementptr i16, ptr [[PB:%.*]], i32 1 -; CHECK-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA]], align 4 -; CHECK-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 -; CHECK-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 -; CHECK-NEXT: [[B_I0:%.*]] = load i16, ptr [[PB]], align 4 -; CHECK-NEXT: [[B_I1:%.*]] = load i16, ptr [[PB_I1]], align 2 -; CHECK-NEXT: store i16 [[A_I0]], ptr [[PA]], align 8 -; CHECK-NEXT: store i16 [[A_I1]], ptr [[PA_I11]], align 2 -; CHECK-NEXT: store i16 [[B_I0]], ptr [[PA_I2]], align 4 -; CHECK-NEXT: store i16 [[B_I1]], ptr [[PA_I3]], align 2 -; CHECK-NEXT: ret void +; MIN16-LABEL: @shufflevector_grow( +; MIN16-NEXT: [[PA_I2:%.*]] = getelementptr i16, ptr [[PA:%.*]], i32 2 +; MIN16-NEXT: [[PA_I3:%.*]] = getelementptr i16, ptr [[PA]], i32 3 +; MIN16-NEXT: [[PB_I1:%.*]] = getelementptr i16, ptr [[PB:%.*]], i32 1 +; MIN16-NEXT: [[A_I0:%.*]] = load i16, ptr [[PA]], align 4 +; MIN16-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 +; MIN16-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 +; MIN16-NEXT: [[B_I0:%.*]] = load i16, ptr [[PB]], align 4 +; MIN16-NEXT: [[B_I1:%.*]] = load i16, ptr [[PB_I1]], align 2 +; MIN16-NEXT: store i16 [[A_I0]], ptr [[PA]], 
align 8 +; MIN16-NEXT: store i16 [[A_I1]], ptr [[PA_I1]], align 2 +; MIN16-NEXT: store i16 [[B_I0]], ptr [[PA_I2]], align 4 +; MIN16-NEXT: store i16 [[B_I1]], ptr [[PA_I3]], align 2 +; MIN16-NEXT: ret void +; +; MIN32-LABEL: @shufflevector_grow( +; MIN32-NEXT: [[PA_I1:%.*]] = getelementptr <2 x i16>, ptr [[PA:%.*]], i32 1 +; MIN32-NEXT: [[A:%.*]] = load <2 x i16>, ptr [[PA]], align 4 +; MIN32-NEXT: [[B:%.*]] = load <2 x i16>, ptr [[PB:%.*]], align 4 +; MIN32-NEXT: [[R:%.*]] = shufflevector <2 x i16> [[A]], <2 x i16> [[B]], <4 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> poison, <2 x i32> +; MIN32-NEXT: store <2 x i16> [[R_I0]], ptr [[PA]], align 8 +; MIN32-NEXT: [[R_I1:%.*]] = shufflevector <4 x i16> [[R]], <4 x i16> poison, <2 x i32> +; MIN32-NEXT: store <2 x i16> [[R_I1]], ptr [[PA_I1]], align 4 +; MIN32-NEXT: ret void ; %a = load <2 x i16>, ptr %pa %b = load <2 x i16>, ptr %pb @@ -571,15 +767,25 @@ define void @shufflevector_grow(ptr %pa, ptr %pb) { } define void @shufflevector_shrink(ptr %pa) { -; CHECK-LABEL: @shufflevector_shrink( -; CHECK-NEXT: [[PA_I11:%.*]] = getelementptr i16, ptr [[PA:%.*]], i32 1 -; CHECK-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA]], i32 1 -; CHECK-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 -; CHECK-NEXT: [[PA_I2:%.*]] = getelementptr i16, ptr [[PA]], i32 2 -; CHECK-NEXT: [[A_I2:%.*]] = load i16, ptr [[PA_I2]], align 4 -; CHECK-NEXT: store i16 [[A_I1]], ptr [[PA]], align 4 -; CHECK-NEXT: store i16 [[A_I2]], ptr [[PA_I11]], align 2 -; CHECK-NEXT: ret void +; MIN16-LABEL: @shufflevector_shrink( +; MIN16-NEXT: [[PA_I1:%.*]] = getelementptr i16, ptr [[PA:%.*]], i32 1 +; MIN16-NEXT: [[A_I1:%.*]] = load i16, ptr [[PA_I1]], align 2 +; MIN16-NEXT: [[PA_I2:%.*]] = getelementptr i16, ptr [[PA]], i32 2 +; MIN16-NEXT: [[A_I2:%.*]] = load i16, ptr [[PA_I2]], align 4 +; MIN16-NEXT: store i16 [[A_I1]], ptr [[PA]], align 4 +; MIN16-NEXT: store i16 [[A_I2]], ptr [[PA_I1]], align 2 +; MIN16-NEXT: ret 
void +; +; MIN32-LABEL: @shufflevector_shrink( +; MIN32-NEXT: [[A_I0:%.*]] = load <2 x i16>, ptr [[PA:%.*]], align 8 +; MIN32-NEXT: [[PA_I1:%.*]] = getelementptr <2 x i16>, ptr [[PA]], i32 1 +; MIN32-NEXT: [[A_I1:%.*]] = load <2 x i16>, ptr [[PA_I1]], align 4 +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[A_I0]], <2 x i16> [[A_I0]], <4 x i32> +; MIN32-NEXT: [[TMP2:%.*]] = shufflevector <2 x i16> [[A_I1]], <2 x i16> [[A_I1]], <4 x i32> +; MIN32-NEXT: [[A:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP2]], <4 x i32> +; MIN32-NEXT: [[R:%.*]] = shufflevector <4 x i16> [[A]], <4 x i16> poison, <2 x i32> +; MIN32-NEXT: store <2 x i16> [[R]], ptr [[PA]], align 4 +; MIN32-NEXT: ret void ; %a = load <4 x i16>, ptr %pa %r = shufflevector <4 x i16> %a, <4 x i16> poison, <2 x i32> @@ -588,27 +794,43 @@ define void @shufflevector_shrink(ptr %pa) { } define void @phi_v2f16(ptr %base, i64 %bound) { -; CHECK-LABEL: @phi_v2f16( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BASE_I1:%.*]] = getelementptr half, ptr [[BASE:%.*]], i32 1 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[X_I0:%.*]] = phi half [ 0xH0000, [[ENTRY:%.*]] ], [ [[X_NEXT_I0:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[X_I1:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[P:%.*]] = getelementptr <2 x half>, ptr [[BASE]], i64 [[IDX]] -; CHECK-NEXT: [[A_I0:%.*]] = load half, ptr [[P]], align 2 -; CHECK-NEXT: [[P_I1:%.*]] = getelementptr half, ptr [[P]], i32 1 -; CHECK-NEXT: [[A_I1:%.*]] = load half, ptr [[P_I1]], align 2 -; CHECK-NEXT: [[X_NEXT_I0]] = fadd half [[X_I0]], [[A_I0]] -; CHECK-NEXT: [[X_NEXT_I1]] = fadd half [[X_I1]], [[A_I1]] -; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 1 -; CHECK-NEXT: [[CC:%.*]] = icmp ult i64 [[IDX_NEXT]], [[BOUND:%.*]] -; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] -; CHECK: end: -; CHECK-NEXT: store half 
[[X_NEXT_I0]], ptr [[BASE]], align 4 -; CHECK-NEXT: store half [[X_NEXT_I1]], ptr [[BASE_I1]], align 2 -; CHECK-NEXT: ret void +; MIN16-LABEL: @phi_v2f16( +; MIN16-NEXT: entry: +; MIN16-NEXT: [[BASE_I1:%.*]] = getelementptr half, ptr [[BASE:%.*]], i32 1 +; MIN16-NEXT: br label [[LOOP:%.*]] +; MIN16: loop: +; MIN16-NEXT: [[X_I0:%.*]] = phi half [ 0xH0000, [[ENTRY:%.*]] ], [ [[X_NEXT_I0:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[X_I1:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I1:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[P:%.*]] = getelementptr <2 x half>, ptr [[BASE]], i64 [[IDX]] +; MIN16-NEXT: [[A_I0:%.*]] = load half, ptr [[P]], align 2 +; MIN16-NEXT: [[P_I1:%.*]] = getelementptr half, ptr [[P]], i32 1 +; MIN16-NEXT: [[A_I1:%.*]] = load half, ptr [[P_I1]], align 2 +; MIN16-NEXT: [[X_NEXT_I0]] = fadd half [[X_I0]], [[A_I0]] +; MIN16-NEXT: [[X_NEXT_I1]] = fadd half [[X_I1]], [[A_I1]] +; MIN16-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 1 +; MIN16-NEXT: [[CC:%.*]] = icmp ult i64 [[IDX_NEXT]], [[BOUND:%.*]] +; MIN16-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] +; MIN16: end: +; MIN16-NEXT: store half [[X_NEXT_I0]], ptr [[BASE]], align 4 +; MIN16-NEXT: store half [[X_NEXT_I1]], ptr [[BASE_I1]], align 2 +; MIN16-NEXT: ret void +; +; MIN32-LABEL: @phi_v2f16( +; MIN32-NEXT: entry: +; MIN32-NEXT: br label [[LOOP:%.*]] +; MIN32: loop: +; MIN32-NEXT: [[X:%.*]] = phi <2 x half> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[X_NEXT:%.*]], [[LOOP]] ] +; MIN32-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] +; MIN32-NEXT: [[P:%.*]] = getelementptr <2 x half>, ptr [[BASE:%.*]], i64 [[IDX]] +; MIN32-NEXT: [[A:%.*]] = load <2 x half>, ptr [[P]], align 2 +; MIN32-NEXT: [[X_NEXT]] = fadd <2 x half> [[X]], [[A]] +; MIN32-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 1 +; MIN32-NEXT: [[CC:%.*]] = icmp ult i64 [[IDX_NEXT]], [[BOUND:%.*]] +; MIN32-NEXT: br i1 [[CC]], label [[LOOP]], label 
[[END:%.*]] +; MIN32: end: +; MIN32-NEXT: store <2 x half> [[X_NEXT]], ptr [[BASE]], align 4 +; MIN32-NEXT: ret void ; entry: br label %loop @@ -629,33 +851,55 @@ end: } define void @phi_v3f16(ptr %base, i64 %bound) { -; CHECK-LABEL: @phi_v3f16( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BASE_I1:%.*]] = getelementptr half, ptr [[BASE:%.*]], i32 1 -; CHECK-NEXT: [[BASE_I2:%.*]] = getelementptr half, ptr [[BASE]], i32 2 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[X_I0:%.*]] = phi half [ 0xH0000, [[ENTRY:%.*]] ], [ [[X_NEXT_I0:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[X_I1:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[X_I2:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[P:%.*]] = getelementptr <3 x half>, ptr [[BASE]], i64 [[IDX]] -; CHECK-NEXT: [[A_I0:%.*]] = load half, ptr [[P]], align 2 -; CHECK-NEXT: [[P_I1:%.*]] = getelementptr half, ptr [[P]], i32 1 -; CHECK-NEXT: [[A_I1:%.*]] = load half, ptr [[P_I1]], align 2 -; CHECK-NEXT: [[P_I2:%.*]] = getelementptr half, ptr [[P]], i32 2 -; CHECK-NEXT: [[A_I2:%.*]] = load half, ptr [[P_I2]], align 2 -; CHECK-NEXT: [[X_NEXT_I0]] = fadd half [[X_I0]], [[A_I0]] -; CHECK-NEXT: [[X_NEXT_I1]] = fadd half [[X_I1]], [[A_I1]] -; CHECK-NEXT: [[X_NEXT_I2]] = fadd half [[X_I2]], [[A_I2]] -; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 1 -; CHECK-NEXT: [[CC:%.*]] = icmp ult i64 [[IDX_NEXT]], [[BOUND:%.*]] -; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] -; CHECK: end: -; CHECK-NEXT: store half [[X_NEXT_I0]], ptr [[BASE]], align 8 -; CHECK-NEXT: store half [[X_NEXT_I1]], ptr [[BASE_I1]], align 2 -; CHECK-NEXT: store half [[X_NEXT_I2]], ptr [[BASE_I2]], align 4 -; CHECK-NEXT: ret void +; MIN16-LABEL: @phi_v3f16( +; MIN16-NEXT: entry: +; MIN16-NEXT: [[BASE_I1:%.*]] = getelementptr half, ptr [[BASE:%.*]], i32 1 +; MIN16-NEXT: [[BASE_I2:%.*]] = 
getelementptr half, ptr [[BASE]], i32 2 +; MIN16-NEXT: br label [[LOOP:%.*]] +; MIN16: loop: +; MIN16-NEXT: [[X_I0:%.*]] = phi half [ 0xH0000, [[ENTRY:%.*]] ], [ [[X_NEXT_I0:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[X_I1:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I1:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[X_I2:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I2:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[P:%.*]] = getelementptr <3 x half>, ptr [[BASE]], i64 [[IDX]] +; MIN16-NEXT: [[A_I0:%.*]] = load half, ptr [[P]], align 2 +; MIN16-NEXT: [[P_I1:%.*]] = getelementptr half, ptr [[P]], i32 1 +; MIN16-NEXT: [[A_I1:%.*]] = load half, ptr [[P_I1]], align 2 +; MIN16-NEXT: [[P_I2:%.*]] = getelementptr half, ptr [[P]], i32 2 +; MIN16-NEXT: [[A_I2:%.*]] = load half, ptr [[P_I2]], align 2 +; MIN16-NEXT: [[X_NEXT_I0]] = fadd half [[X_I0]], [[A_I0]] +; MIN16-NEXT: [[X_NEXT_I1]] = fadd half [[X_I1]], [[A_I1]] +; MIN16-NEXT: [[X_NEXT_I2]] = fadd half [[X_I2]], [[A_I2]] +; MIN16-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 1 +; MIN16-NEXT: [[CC:%.*]] = icmp ult i64 [[IDX_NEXT]], [[BOUND:%.*]] +; MIN16-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] +; MIN16: end: +; MIN16-NEXT: store half [[X_NEXT_I0]], ptr [[BASE]], align 8 +; MIN16-NEXT: store half [[X_NEXT_I1]], ptr [[BASE_I1]], align 2 +; MIN16-NEXT: store half [[X_NEXT_I2]], ptr [[BASE_I2]], align 4 +; MIN16-NEXT: ret void +; +; MIN32-LABEL: @phi_v3f16( +; MIN32-NEXT: entry: +; MIN32-NEXT: [[BASE_I1:%.*]] = getelementptr <2 x half>, ptr [[BASE:%.*]], i32 1 +; MIN32-NEXT: br label [[LOOP:%.*]] +; MIN32: loop: +; MIN32-NEXT: [[X_I0:%.*]] = phi <2 x half> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[X_NEXT_I0:%.*]], [[LOOP]] ] +; MIN32-NEXT: [[X_I1:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I1:%.*]], [[LOOP]] ] +; MIN32-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] +; MIN32-NEXT: [[P:%.*]] = getelementptr <3 x half>, 
ptr [[BASE]], i64 [[IDX]] +; MIN32-NEXT: [[A_I0:%.*]] = load <2 x half>, ptr [[P]], align 2 +; MIN32-NEXT: [[P_I1:%.*]] = getelementptr <2 x half>, ptr [[P]], i32 1 +; MIN32-NEXT: [[A_I1:%.*]] = load half, ptr [[P_I1]], align 2 +; MIN32-NEXT: [[X_NEXT_I0]] = fadd <2 x half> [[X_I0]], [[A_I0]] +; MIN32-NEXT: [[X_NEXT_I1]] = fadd half [[X_I1]], [[A_I1]] +; MIN32-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 1 +; MIN32-NEXT: [[CC:%.*]] = icmp ult i64 [[IDX_NEXT]], [[BOUND:%.*]] +; MIN32-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] +; MIN32: end: +; MIN32-NEXT: store <2 x half> [[X_NEXT_I0]], ptr [[BASE]], align 8 +; MIN32-NEXT: store half [[X_NEXT_I1]], ptr [[BASE_I1]], align 4 +; MIN32-NEXT: ret void ; entry: br label %loop @@ -676,39 +920,61 @@ end: } define void @phi_v4f16(ptr %base, i64 %bound) { -; CHECK-LABEL: @phi_v4f16( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[BASE_I1:%.*]] = getelementptr half, ptr [[BASE:%.*]], i32 1 -; CHECK-NEXT: [[BASE_I2:%.*]] = getelementptr half, ptr [[BASE]], i32 2 -; CHECK-NEXT: [[BASE_I3:%.*]] = getelementptr half, ptr [[BASE]], i32 3 -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[X_I0:%.*]] = phi half [ 0xH0000, [[ENTRY:%.*]] ], [ [[X_NEXT_I0:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[X_I1:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I1:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[X_I2:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I2:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[X_I3:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I3:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[P:%.*]] = getelementptr <4 x half>, ptr [[BASE]], i64 [[IDX]] -; CHECK-NEXT: [[A_I0:%.*]] = load half, ptr [[P]], align 2 -; CHECK-NEXT: [[P_I1:%.*]] = getelementptr half, ptr [[P]], i32 1 -; CHECK-NEXT: [[A_I1:%.*]] = load half, ptr [[P_I1]], align 2 -; CHECK-NEXT: [[P_I2:%.*]] = getelementptr half, ptr [[P]], i32 2 -; CHECK-NEXT: [[A_I2:%.*]] = load half, ptr [[P_I2]], 
align 2 -; CHECK-NEXT: [[P_I3:%.*]] = getelementptr half, ptr [[P]], i32 3 -; CHECK-NEXT: [[A_I3:%.*]] = load half, ptr [[P_I3]], align 2 -; CHECK-NEXT: [[X_NEXT_I0]] = fadd half [[X_I0]], [[A_I0]] -; CHECK-NEXT: [[X_NEXT_I1]] = fadd half [[X_I1]], [[A_I1]] -; CHECK-NEXT: [[X_NEXT_I2]] = fadd half [[X_I2]], [[A_I2]] -; CHECK-NEXT: [[X_NEXT_I3]] = fadd half [[X_I3]], [[A_I3]] -; CHECK-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 1 -; CHECK-NEXT: [[CC:%.*]] = icmp ult i64 [[IDX_NEXT]], [[BOUND:%.*]] -; CHECK-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] -; CHECK: end: -; CHECK-NEXT: store half [[X_NEXT_I0]], ptr [[BASE]], align 8 -; CHECK-NEXT: store half [[X_NEXT_I1]], ptr [[BASE_I1]], align 2 -; CHECK-NEXT: store half [[X_NEXT_I2]], ptr [[BASE_I2]], align 4 -; CHECK-NEXT: store half [[X_NEXT_I3]], ptr [[BASE_I3]], align 2 -; CHECK-NEXT: ret void +; MIN16-LABEL: @phi_v4f16( +; MIN16-NEXT: entry: +; MIN16-NEXT: [[BASE_I1:%.*]] = getelementptr half, ptr [[BASE:%.*]], i32 1 +; MIN16-NEXT: [[BASE_I2:%.*]] = getelementptr half, ptr [[BASE]], i32 2 +; MIN16-NEXT: [[BASE_I3:%.*]] = getelementptr half, ptr [[BASE]], i32 3 +; MIN16-NEXT: br label [[LOOP:%.*]] +; MIN16: loop: +; MIN16-NEXT: [[X_I0:%.*]] = phi half [ 0xH0000, [[ENTRY:%.*]] ], [ [[X_NEXT_I0:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[X_I1:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I1:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[X_I2:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I2:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[X_I3:%.*]] = phi half [ 0xH0000, [[ENTRY]] ], [ [[X_NEXT_I3:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] +; MIN16-NEXT: [[P:%.*]] = getelementptr <4 x half>, ptr [[BASE]], i64 [[IDX]] +; MIN16-NEXT: [[A_I0:%.*]] = load half, ptr [[P]], align 2 +; MIN16-NEXT: [[P_I1:%.*]] = getelementptr half, ptr [[P]], i32 1 +; MIN16-NEXT: [[A_I1:%.*]] = load half, ptr [[P_I1]], align 2 +; MIN16-NEXT: [[P_I2:%.*]] = getelementptr half, ptr [[P]], i32 2 +; 
MIN16-NEXT: [[A_I2:%.*]] = load half, ptr [[P_I2]], align 2 +; MIN16-NEXT: [[P_I3:%.*]] = getelementptr half, ptr [[P]], i32 3 +; MIN16-NEXT: [[A_I3:%.*]] = load half, ptr [[P_I3]], align 2 +; MIN16-NEXT: [[X_NEXT_I0]] = fadd half [[X_I0]], [[A_I0]] +; MIN16-NEXT: [[X_NEXT_I1]] = fadd half [[X_I1]], [[A_I1]] +; MIN16-NEXT: [[X_NEXT_I2]] = fadd half [[X_I2]], [[A_I2]] +; MIN16-NEXT: [[X_NEXT_I3]] = fadd half [[X_I3]], [[A_I3]] +; MIN16-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 1 +; MIN16-NEXT: [[CC:%.*]] = icmp ult i64 [[IDX_NEXT]], [[BOUND:%.*]] +; MIN16-NEXT: br i1 [[CC]], label [[LOOP]], label [[END:%.*]] +; MIN16: end: +; MIN16-NEXT: store half [[X_NEXT_I0]], ptr [[BASE]], align 8 +; MIN16-NEXT: store half [[X_NEXT_I1]], ptr [[BASE_I1]], align 2 +; MIN16-NEXT: store half [[X_NEXT_I2]], ptr [[BASE_I2]], align 4 +; MIN16-NEXT: store half [[X_NEXT_I3]], ptr [[BASE_I3]], align 2 +; MIN16-NEXT: ret void +; +; MIN32-LABEL: @phi_v4f16( +; MIN32-NEXT: entry: +; MIN32-NEXT: [[BASE_I1:%.*]] = getelementptr <2 x half>, ptr [[BASE:%.*]], i32 1 +; MIN32-NEXT: br label [[LOOP:%.*]] +; MIN32: loop: +; MIN32-NEXT: [[X_I0:%.*]] = phi <2 x half> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[X_NEXT_I0:%.*]], [[LOOP]] ] +; MIN32-NEXT: [[X_I1:%.*]] = phi <2 x half> [ zeroinitializer, [[ENTRY]] ], [ [[X_NEXT_I1:%.*]], [[LOOP]] ] +; MIN32-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IDX_NEXT:%.*]], [[LOOP]] ] +; MIN32-NEXT: [[P:%.*]] = getelementptr <4 x half>, ptr [[BASE]], i64 [[IDX]] +; MIN32-NEXT: [[A_I0:%.*]] = load <2 x half>, ptr [[P]], align 2 +; MIN32-NEXT: [[P_I1:%.*]] = getelementptr <2 x half>, ptr [[P]], i32 1 +; MIN32-NEXT: [[A_I1:%.*]] = load <2 x half>, ptr [[P_I1]], align 2 +; MIN32-NEXT: [[X_NEXT_I0]] = fadd <2 x half> [[X_I0]], [[A_I0]] +; MIN32-NEXT: [[X_NEXT_I1]] = fadd <2 x half> [[X_I1]], [[A_I1]] +; MIN32-NEXT: [[IDX_NEXT]] = add i64 [[IDX]], 1 +; MIN32-NEXT: [[CC:%.*]] = icmp ult i64 [[IDX_NEXT]], [[BOUND:%.*]] +; MIN32-NEXT: br i1 [[CC]], label [[LOOP]], label 
[[END:%.*]] +; MIN32: end: +; MIN32-NEXT: store <2 x half> [[X_NEXT_I0]], ptr [[BASE]], align 8 +; MIN32-NEXT: store <2 x half> [[X_NEXT_I1]], ptr [[BASE_I1]], align 4 +; MIN32-NEXT: ret void ; entry: br label %loop @@ -729,60 +995,87 @@ end: } define <2 x half> @call_v2f16(<2 x half> %a, <2 x half> %b) { -; CHECK-LABEL: @call_v2f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <2 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = extractelement <2 x half> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = call half @llvm.minnum.f16(half [[A_I0]], half [[B_I0]]) -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <2 x half> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <2 x half> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = call half @llvm.minnum.f16(half [[A_I1]], half [[B_I1]]) -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R:%.*]] = insertelement <2 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: ret <2 x half> [[R]] +; MIN16-LABEL: @call_v2f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <2 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[B_I0:%.*]] = extractelement <2 x half> [[B:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = call half @llvm.minnum.f16(half [[A_I0]], half [[B_I0]]) +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <2 x half> [[A]], i64 1 +; MIN16-NEXT: [[B_I1:%.*]] = extractelement <2 x half> [[B]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = call half @llvm.minnum.f16(half [[A_I1]], half [[B_I1]]) +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <2 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R:%.*]] = insertelement <2 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: ret <2 x half> [[R]] +; +; MIN32-LABEL: @call_v2f16( +; MIN32-NEXT: [[R:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[A:%.*]], <2 x half> [[B:%.*]]) +; MIN32-NEXT: ret <2 x half> [[R]] ; %r = call <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b) ret <2 x half> %r } define <3 x half> @call_v3f16(<3 x 
half> %a, <3 x half> %b) { -; CHECK-LABEL: @call_v3f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <3 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = extractelement <3 x half> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = call half @llvm.minnum.f16(half [[A_I0]], half [[B_I0]]) -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <3 x half> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = call half @llvm.minnum.f16(half [[A_I1]], half [[B_I1]]) -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <3 x half> [[A]], i64 2 -; CHECK-NEXT: [[B_I2:%.*]] = extractelement <3 x half> [[B]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = call half @llvm.minnum.f16(half [[A_I2]], half [[B_I2]]) -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x half> poison, half [[R_I0]], i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: [[R:%.*]] = insertelement <3 x half> [[R_UPTO1]], half [[R_I2]], i64 2 -; CHECK-NEXT: ret <3 x half> [[R]] +; MIN16-LABEL: @call_v3f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <3 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[B_I0:%.*]] = extractelement <3 x half> [[B:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = call half @llvm.minnum.f16(half [[A_I0]], half [[B_I0]]) +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 1 +; MIN16-NEXT: [[B_I1:%.*]] = extractelement <3 x half> [[B]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = call half @llvm.minnum.f16(half [[A_I1]], half [[B_I1]]) +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <3 x half> [[A]], i64 2 +; MIN16-NEXT: [[B_I2:%.*]] = extractelement <3 x half> [[B]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = call half @llvm.minnum.f16(half [[A_I2]], half [[B_I2]]) +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <3 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <3 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: [[R:%.*]] = insertelement <3 x half> [[R_UPTO1]], 
half [[R_I2]], i64 2 +; MIN16-NEXT: ret <3 x half> [[R]] +; +; MIN32-LABEL: @call_v3f16( +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <3 x half> [[A:%.*]], <3 x half> poison, <2 x i32> +; MIN32-NEXT: [[B_I0:%.*]] = shufflevector <3 x half> [[B:%.*]], <3 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[A_I0]], <2 x half> [[B_I0]]) +; MIN32-NEXT: [[A_I1:%.*]] = extractelement <3 x half> [[A]], i64 2 +; MIN32-NEXT: [[B_I1:%.*]] = extractelement <3 x half> [[B]], i64 2 +; MIN32-NEXT: [[R_I1:%.*]] = call half @llvm.minnum.f16(half [[A_I1]], half [[B_I1]]) +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[R_I0]], <2 x half> [[R_I0]], <3 x i32> +; MIN32-NEXT: [[R:%.*]] = insertelement <3 x half> [[TMP1]], half [[R_I1]], i64 2 +; MIN32-NEXT: ret <3 x half> [[R]] ; %r = call <3 x half> @llvm.minnum.v3f16(<3 x half> %a, <3 x half> %b) ret <3 x half> %r } define <4 x half> @call_v4f16(<4 x half> %a, <4 x half> %b) { -; CHECK-LABEL: @call_v4f16( -; CHECK-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 -; CHECK-NEXT: [[B_I0:%.*]] = extractelement <4 x half> [[B:%.*]], i64 0 -; CHECK-NEXT: [[R_I0:%.*]] = call half @llvm.minnum.f16(half [[A_I0]], half [[B_I0]]) -; CHECK-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 -; CHECK-NEXT: [[B_I1:%.*]] = extractelement <4 x half> [[B]], i64 1 -; CHECK-NEXT: [[R_I1:%.*]] = call half @llvm.minnum.f16(half [[A_I1]], half [[B_I1]]) -; CHECK-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 -; CHECK-NEXT: [[B_I2:%.*]] = extractelement <4 x half> [[B]], i64 2 -; CHECK-NEXT: [[R_I2:%.*]] = call half @llvm.minnum.f16(half [[A_I2]], half [[B_I2]]) -; CHECK-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 -; CHECK-NEXT: [[B_I3:%.*]] = extractelement <4 x half> [[B]], i64 3 -; CHECK-NEXT: [[R_I3:%.*]] = call half @llvm.minnum.f16(half [[A_I3]], half [[B_I3]]) -; CHECK-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x half> poison, half [[R_I0]], 
i64 0 -; CHECK-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x half> [[R_UPTO0]], half [[R_I1]], i64 1 -; CHECK-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x half> [[R_UPTO1]], half [[R_I2]], i64 2 -; CHECK-NEXT: [[R:%.*]] = insertelement <4 x half> [[R_UPTO2]], half [[R_I3]], i64 3 -; CHECK-NEXT: ret <4 x half> [[R]] +; MIN16-LABEL: @call_v4f16( +; MIN16-NEXT: [[A_I0:%.*]] = extractelement <4 x half> [[A:%.*]], i64 0 +; MIN16-NEXT: [[B_I0:%.*]] = extractelement <4 x half> [[B:%.*]], i64 0 +; MIN16-NEXT: [[R_I0:%.*]] = call half @llvm.minnum.f16(half [[A_I0]], half [[B_I0]]) +; MIN16-NEXT: [[A_I1:%.*]] = extractelement <4 x half> [[A]], i64 1 +; MIN16-NEXT: [[B_I1:%.*]] = extractelement <4 x half> [[B]], i64 1 +; MIN16-NEXT: [[R_I1:%.*]] = call half @llvm.minnum.f16(half [[A_I1]], half [[B_I1]]) +; MIN16-NEXT: [[A_I2:%.*]] = extractelement <4 x half> [[A]], i64 2 +; MIN16-NEXT: [[B_I2:%.*]] = extractelement <4 x half> [[B]], i64 2 +; MIN16-NEXT: [[R_I2:%.*]] = call half @llvm.minnum.f16(half [[A_I2]], half [[B_I2]]) +; MIN16-NEXT: [[A_I3:%.*]] = extractelement <4 x half> [[A]], i64 3 +; MIN16-NEXT: [[B_I3:%.*]] = extractelement <4 x half> [[B]], i64 3 +; MIN16-NEXT: [[R_I3:%.*]] = call half @llvm.minnum.f16(half [[A_I3]], half [[B_I3]]) +; MIN16-NEXT: [[R_UPTO0:%.*]] = insertelement <4 x half> poison, half [[R_I0]], i64 0 +; MIN16-NEXT: [[R_UPTO1:%.*]] = insertelement <4 x half> [[R_UPTO0]], half [[R_I1]], i64 1 +; MIN16-NEXT: [[R_UPTO2:%.*]] = insertelement <4 x half> [[R_UPTO1]], half [[R_I2]], i64 2 +; MIN16-NEXT: [[R:%.*]] = insertelement <4 x half> [[R_UPTO2]], half [[R_I3]], i64 3 +; MIN16-NEXT: ret <4 x half> [[R]] +; +; MIN32-LABEL: @call_v4f16( +; MIN32-NEXT: [[A_I0:%.*]] = shufflevector <4 x half> [[A:%.*]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[B_I0:%.*]] = shufflevector <4 x half> [[B:%.*]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I0:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[A_I0]], <2 x half> [[B_I0]]) +; MIN32-NEXT: [[A_I1:%.*]] 
= shufflevector <4 x half> [[A]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[B_I1:%.*]] = shufflevector <4 x half> [[B]], <4 x half> poison, <2 x i32> +; MIN32-NEXT: [[R_I1:%.*]] = call <2 x half> @llvm.minnum.v2f16(<2 x half> [[A_I1]], <2 x half> [[B_I1]]) +; MIN32-NEXT: [[TMP1:%.*]] = shufflevector <2 x half> [[R_I0]], <2 x half> [[R_I0]], <4 x i32> +; MIN32-NEXT: [[TMP2:%.*]] = shufflevector <2 x half> [[R_I1]], <2 x half> [[R_I1]], <4 x i32> +; MIN32-NEXT: [[R:%.*]] = shufflevector <4 x half> [[TMP1]], <4 x half> [[TMP2]], <4 x i32> +; MIN32-NEXT: ret <4 x half> [[R]] ; %r = call <4 x half> @llvm.minnum.v4f16(<4 x half> %a, <4 x half> %b) ret <4 x half> %r diff --git a/llvm/test/Transforms/Scalarizer/opaque-ptr-bug.ll b/llvm/test/Transforms/Scalarizer/opaque-ptr-bug.ll index 81737a1849154..7d0ba0aa45f05 100644 --- a/llvm/test/Transforms/Scalarizer/opaque-ptr-bug.ll +++ b/llvm/test/Transforms/Scalarizer/opaque-ptr-bug.ll @@ -6,19 +6,18 @@ define void @test1(ptr %p) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: [[P_I12:%.*]] = getelementptr i16, ptr [[P:%.*]], i32 1 -; CHECK-NEXT: [[P_I11:%.*]] = getelementptr i32, ptr [[P]], i32 1 +; CHECK-NEXT: [[P_I11:%.*]] = getelementptr i16, ptr [[P:%.*]], i32 1 ; CHECK-NEXT: [[P_I2:%.*]] = getelementptr i32, ptr [[P]], i32 2 ; CHECK-NEXT: [[P_I3:%.*]] = getelementptr i32, ptr [[P]], i32 3 ; CHECK-NEXT: store i32 0, ptr [[P]], align 8 ; CHECK-NEXT: [[P_I1:%.*]] = getelementptr i32, ptr [[P]], i32 1 ; CHECK-NEXT: store i32 0, ptr [[P_I1]], align 4 ; CHECK-NEXT: store i32 0, ptr [[P]], align 16 -; CHECK-NEXT: store i32 0, ptr [[P_I11]], align 4 +; CHECK-NEXT: store i32 0, ptr [[P_I1]], align 4 ; CHECK-NEXT: store i32 0, ptr [[P_I2]], align 8 ; CHECK-NEXT: store i32 0, ptr [[P_I3]], align 4 ; CHECK-NEXT: store i16 0, ptr [[P]], align 4 -; CHECK-NEXT: store i16 0, ptr [[P_I12]], align 2 +; CHECK-NEXT: store i16 0, ptr [[P_I11]], align 2 ; CHECK-NEXT: ret void ; store <2 x i32> zeroinitializer, ptr %p diff --git 
a/llvm/test/Transforms/Scalarizer/vector-gep.ll b/llvm/test/Transforms/Scalarizer/vector-gep.ll index c93c95e7f398f..3d8f1997dc383 100644 --- a/llvm/test/Transforms/Scalarizer/vector-gep.ll +++ b/llvm/test/Transforms/Scalarizer/vector-gep.ll @@ -40,16 +40,10 @@ define void @test2() { ; CHECK-NEXT: [[DOTI2:%.*]] = extractelement <4 x ptr> [[TMP0]], i64 2 ; CHECK-NEXT: [[DOTI3:%.*]] = extractelement <4 x ptr> [[TMP0]], i64 3 ; CHECK-NEXT: [[INDEX:%.*]] = load i16, ptr @index, align 2 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[INDEX]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x i16> [[DOTSPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT_I0:%.*]] = extractelement <4 x i16> [[DOTSPLAT]], i64 0 -; CHECK-NEXT: [[DOTI01:%.*]] = getelementptr i16, ptr [[DOTI0]], i16 [[DOTSPLAT_I0]] -; CHECK-NEXT: [[DOTSPLAT_I1:%.*]] = extractelement <4 x i16> [[DOTSPLAT]], i64 1 -; CHECK-NEXT: [[DOTI12:%.*]] = getelementptr i16, ptr [[DOTI1]], i16 [[DOTSPLAT_I1]] -; CHECK-NEXT: [[DOTSPLAT_I2:%.*]] = extractelement <4 x i16> [[DOTSPLAT]], i64 2 -; CHECK-NEXT: [[DOTI23:%.*]] = getelementptr i16, ptr [[DOTI2]], i16 [[DOTSPLAT_I2]] -; CHECK-NEXT: [[DOTSPLAT_I3:%.*]] = extractelement <4 x i16> [[DOTSPLAT]], i64 3 -; CHECK-NEXT: [[DOTI34:%.*]] = getelementptr i16, ptr [[DOTI3]], i16 [[DOTSPLAT_I3]] +; CHECK-NEXT: [[DOTI01:%.*]] = getelementptr i16, ptr [[DOTI0]], i16 [[INDEX]] +; CHECK-NEXT: [[DOTI12:%.*]] = getelementptr i16, ptr [[DOTI1]], i16 [[INDEX]] +; CHECK-NEXT: [[DOTI23:%.*]] = getelementptr i16, ptr [[DOTI2]], i16 [[INDEX]] +; CHECK-NEXT: [[DOTI34:%.*]] = getelementptr i16, ptr [[DOTI3]], i16 [[INDEX]] ; CHECK-NEXT: ret void ; bb: @@ -77,16 +71,10 @@ define <4 x ptr> @test3_constbase(i16 %idx) { ; CHECK-LABEL: @test3_constbase( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[OFFSET:%.*]] = getelementptr [4 x i16], ptr @ptr, i16 0, i16 [[IDX:%.*]] -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> 
poison, ptr [[OFFSET]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> [[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT_I0:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 0 -; CHECK-NEXT: [[GEP_I0:%.*]] = getelementptr i16, ptr [[DOTSPLAT_I0]], i16 0 -; CHECK-NEXT: [[DOTSPLAT_I1:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 1 -; CHECK-NEXT: [[GEP_I1:%.*]] = getelementptr i16, ptr [[DOTSPLAT_I1]], i16 1 -; CHECK-NEXT: [[DOTSPLAT_I2:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 2 -; CHECK-NEXT: [[GEP_I2:%.*]] = getelementptr i16, ptr [[DOTSPLAT_I2]], i16 2 -; CHECK-NEXT: [[DOTSPLAT_I3:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 3 -; CHECK-NEXT: [[GEP_I3:%.*]] = getelementptr i16, ptr [[DOTSPLAT_I3]], i16 3 +; CHECK-NEXT: [[GEP_I0:%.*]] = getelementptr i16, ptr [[OFFSET]], i16 0 +; CHECK-NEXT: [[GEP_I1:%.*]] = getelementptr i16, ptr [[OFFSET]], i16 1 +; CHECK-NEXT: [[GEP_I2:%.*]] = getelementptr i16, ptr [[OFFSET]], i16 2 +; CHECK-NEXT: [[GEP_I3:%.*]] = getelementptr i16, ptr [[OFFSET]], i16 3 ; CHECK-NEXT: [[GEP_UPTO0:%.*]] = insertelement <4 x ptr> poison, ptr [[GEP_I0]], i64 0 ; CHECK-NEXT: [[GEP_UPTO1:%.*]] = insertelement <4 x ptr> [[GEP_UPTO0]], ptr [[GEP_I1]], i64 1 ; CHECK-NEXT: [[GEP_UPTO2:%.*]] = insertelement <4 x ptr> [[GEP_UPTO1]], ptr [[GEP_I2]], i64 2 @@ -127,16 +115,10 @@ define void @test4() { ; CHECK-LABEL: @test4( ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr @ptrptr, align 8 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> [[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLAT_I0:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 0 -; CHECK-NEXT: [[DOTI0:%.*]] = getelementptr i16, ptr [[DOTSPLAT_I0]], i16 0 -; CHECK-NEXT: [[DOTSPLAT_I1:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 1 -; CHECK-NEXT: [[DOTI1:%.*]] = 
getelementptr i16, ptr [[DOTSPLAT_I1]], i16 1 -; CHECK-NEXT: [[DOTSPLAT_I2:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 2 -; CHECK-NEXT: [[DOTI2:%.*]] = getelementptr i16, ptr [[DOTSPLAT_I2]], i16 2 -; CHECK-NEXT: [[DOTSPLAT_I3:%.*]] = extractelement <4 x ptr> [[DOTSPLAT]], i64 3 -; CHECK-NEXT: [[DOTI3:%.*]] = getelementptr i16, ptr [[DOTSPLAT_I3]], i16 3 +; CHECK-NEXT: [[DOTI0:%.*]] = getelementptr i16, ptr [[TMP0]], i16 0 +; CHECK-NEXT: [[DOTI1:%.*]] = getelementptr i16, ptr [[TMP0]], i16 1 +; CHECK-NEXT: [[DOTI2:%.*]] = getelementptr i16, ptr [[TMP0]], i16 2 +; CHECK-NEXT: [[DOTI3:%.*]] = getelementptr i16, ptr [[TMP0]], i16 3 ; CHECK-NEXT: ret void ; bb: