diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index 7d017095c88ce..a0b47bf2e8a29 100644 --- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -12,17 +12,16 @@ // effective in simplifying arithmetic statements derived from an unrolled loop. // It can also simplify the logic of SeparateConstOffsetFromGEP. // -// There are many optimizations we can perform in the domain of SLSR. This file -// for now contains only an initial step. Specifically, we look for strength -// reduction candidates in the following forms: +// There are many optimizations we can perform in the domain of SLSR. +// We look for strength reduction candidates in the following forms: // -// Form 1: B + i * S -// Form 2: (B + i) * S -// Form 3: &B[i * S] +// Form Add: B + i * S +// Form Mul: (B + i) * S +// Form GEP: &B[i * S] // // where S is an integer variable, and i is a constant integer. If we found two // candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2 -// in a simpler way with respect to S1. For example, +// in a simpler way with respect to S1 (index delta). For example, // // S1: X = B + i * S // S2: Y = B + i' * S => X + (i' - i) * S @@ -35,8 +34,29 @@ // // Note: (i' - i) * S is folded to the extent possible. // +// For Add and GEP forms, we can also rewrite a candidate in a simpler way +// with respect to other dominating candidates if their B or S are different +// but other parts are the same. For example, +// +// Base Delta: +// S1: X = B + i * S +// S2: Y = B' + i * S => X + (B' - B) +// +// S1: X = &B [i * S] +// S2: Y = &B'[i * S] => X + (B' - B) +// +// Stride Delta: +// S1: X = B + i * S +// S2: Y = B + i * S' => X + i * (S' - S) +// +// S1: X = &B[i * S] +// S2: Y = &B[i * S'] => X + i * (S' - S) +// +// Note: a Stride delta rewrite on the Mul form is usually not profitable, and a +// Base delta rewrite is only sometimes profitable, so we support neither on Mul. +// // This rewriting is in general a good idea. The code patterns we focus on -// usually come from loop unrolling, so (i' - i) * S is likely the same +// usually come from loop unrolling, so the delta is likely the same // across iterations and can be reused. When that happens, the optimized form // takes only one add starting from the second iteration. // @@ -47,19 +67,14 @@ // TODO: // // - Floating point arithmetics when fast math is enabled. -// -// - SLSR may decrease ILP at the architecture level. Targets that are very -// sensitive to ILP may want to disable it. Having SLSR to consider ILP is -// left as future work. -// -// - When (i' - i) is constant but i and i' are not, we could still perform -// SLSR.
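The Base Delta and Stride Delta rewrites described above only fire when the delta is available to the rewritten candidate as a dominating value: either a constant or existing IR whose SCEV matches the difference. Below is a minimal IR sketch of the Base Delta case on GEPs; the function and value names are illustrative (not taken from this patch's tests), and it assumes the target cannot simply fold both addresses into its addressing modes, in which case no rewrite would happen.

  ; Before: %x and %y share the index expression %i, but %y's base differs from
  ; %x's base by a constant 16 bytes.
  define i32 @base_delta_sketch(ptr %buf, i64 %i) {
    %p = getelementptr inbounds i8, ptr %buf, i64 16
    %x = getelementptr inbounds i32, ptr %buf, i64 %i
    %y = getelementptr inbounds i32, ptr %p, i64 %i
    %a = load i32, ptr %x
    %b = load i32, ptr %y
    %r = add i32 %a, %b
    ret i32 %r
  }

  ; After the Base Delta rewrite, %y reuses %x and adds only the base difference,
  ; so the second scaled-index computation disappears:
  ;   %y = getelementptr inbounds i8, ptr %x, i64 16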
#include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -86,16 +101,19 @@ #include #include #include +#include #include using namespace llvm; using namespace PatternMatch; +#define DEBUG_TYPE "slsr" + static const unsigned UnknownAddressSpace = std::numeric_limits::max(); DEBUG_COUNTER(StraightLineStrengthReduceCounter, "slsr-counter", - "Controls whether rewriteCandidateWithBasis is executed."); + "Controls whether rewriteCandidate is executed."); namespace { @@ -142,15 +160,23 @@ class StraightLineStrengthReduce { GEP, // &B[..][i * S][..] }; + enum DKind { + InvalidDelta, // reserved for the default constructor + IndexDelta, // Delta is a constant from Index + BaseDelta, // Delta is a constant or variable from Base + StrideDelta, // Delta is a constant or variable from Stride + }; + Candidate() = default; Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S, - Instruction *I) - : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {} + Instruction *I, const SCEV *StrideSCEV) + : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I), + StrideSCEV(StrideSCEV) {} Kind CandidateKind = Invalid; const SCEV *Base = nullptr; - + // TODO: Swap Index and Stride's name. // Note that Index and Stride of a GEP candidate do not necessarily have the // same integer type. In that case, during rewriting, Stride will be // sign-extended or truncated to Index's type. @@ -177,22 +203,154 @@ class StraightLineStrengthReduce { // Points to the immediate basis of this candidate, or nullptr if we cannot // find any basis for this candidate. Candidate *Basis = nullptr; + + DKind DeltaKind = InvalidDelta; + + // Store SCEV of Stride to compute delta from different strides + const SCEV *StrideSCEV = nullptr; + + // Points to (Y - X) that will be used to rewrite this candidate. + Value *Delta = nullptr; + + /// Cost model: Evaluate the computational efficiency of the candidate. + /// + /// Efficiency levels (higher is better): + /// 5 - No instruction: + /// [Variable] or [Const] + /// 4 - One instruction with one variable: + /// [Variable + Const] or [Variable * Const] + /// 3 - One instruction with two variables: + /// [Variable + Variable] or [Variable * Variable] + /// 2 - Two instructions with one variable: + /// [Const + Const * Variable] + /// 1 - Two instructions with two variables: + /// [Variable + Const * Variable] + static unsigned getComputationEfficiency(Kind CandidateKind, + const ConstantInt *Index, + const Value *Stride, + const SCEV *Base = nullptr) { + bool IsConstantBase = false; + bool IsZeroBase = false; + // When evaluating the efficiency of a rewrite, if the Basis's SCEV is + // not available, conservatively assume the base is not constant. + if (auto *ConstBase = dyn_cast_or_null(Base)) { + IsConstantBase = true; + IsZeroBase = ConstBase->getValue()->isZero(); + } + + bool IsConstantStride = isa(Stride); + bool IsZeroStride = + IsConstantStride && cast(Stride)->isZero(); + // All constants + if (IsConstantBase && IsConstantStride) + return 5; + + // [(Base + Index) * Stride] + if (CandidateKind == Mul) { + if (IsZeroStride) + return 5; + if (Index->isZero()) + return (IsConstantStride || IsConstantBase) ? 
4 : 3; + + if (IsConstantBase) + return IsZeroBase && (Index->isOne() || Index->isMinusOne()) ? 5 : 4; + + if (IsConstantStride) { + auto *CI = cast(Stride); + return (CI->isOne() || CI->isMinusOne()) ? 4 : 2; + } + return 1; + } + + // Base + Index * Stride + assert(CandidateKind == Add || CandidateKind == GEP); + if (Index->isZero() || IsZeroStride) + return 5; + + bool IsSimpleIndex = Index->isOne() || Index->isMinusOne(); + + if (IsConstantBase) + return IsZeroBase ? (IsSimpleIndex ? 5 : 4) : (IsSimpleIndex ? 4 : 2); + + if (IsConstantStride) + return IsZeroStride ? 5 : 4; + + if (IsSimpleIndex) + return 3; + + return 1; + } + + // Evaluate if the given delta is profitable to rewrite this candidate. + bool isProfitableRewrite(const Value *Delta, const DKind DeltaKind) const { + // This function cannot accurately evaluate the profit of whole expression + // with context. A candidate (B + I * S) cannot express whether this + // instruction needs to compute on its own (I * S), which may be shared + // with other candidates or may need instructions to compute. + // If the rewritten form has the same strength, still rewrite to + // (X + Delta) since it may expose more CSE opportunities on Delta, as + // unrolled loops usually have identical Delta for each unrolled body. + // + // Note, this function should only be used on Index Delta rewrite. + // Base and Stride delta need context info to evaluate the register + // pressure impact from variable delta. + return getComputationEfficiency(CandidateKind, Index, Stride, Base) <= + getRewriteProfit(Delta, DeltaKind); + } + + // Evaluate the rewrite profit of this candidate with its Basis + unsigned getRewriteProfit() const { + return Basis ? getRewriteProfit(Delta, DeltaKind) : 0; + } + + // Evaluate the rewrite profit of this candidate with a given delta + unsigned getRewriteProfit(const Value *Delta, const DKind DeltaKind) const { + switch (DeltaKind) { + case BaseDelta: // [X + Delta] + return getComputationEfficiency( + CandidateKind, + ConstantInt::get(cast(Delta->getType()), 1), Delta); + case StrideDelta: // [X + Index * Delta] + return getComputationEfficiency(CandidateKind, Index, Delta); + case IndexDelta: // [X + Delta * Stride] + return getComputationEfficiency(CandidateKind, cast(Delta), + Stride); + default: + return 0; + } + } + + bool isHighEfficiency() const { + return getComputationEfficiency(CandidateKind, Index, Stride, Base) >= 4; + } + + // Verify that this candidate has valid delta components relative to the + // basis + bool hasValidDelta(const Candidate &Basis) const { + switch (DeltaKind) { + case IndexDelta: + // Index differs, Base and Stride must match + return Base == Basis.Base && StrideSCEV == Basis.StrideSCEV; + case StrideDelta: + // Stride differs, Base and Index must match + return Base == Basis.Base && Index == Basis.Index; + case BaseDelta: + // Base differs, Stride and Index must match + return StrideSCEV == Basis.StrideSCEV && Index == Basis.Index; + default: + return false; + } + } }; bool runOnFunction(Function &F); private: - // Returns true if Basis is a basis for C, i.e., Basis dominates C and they - // share the same base and stride. - bool isBasisFor(const Candidate &Basis, const Candidate &C); - + // Fetch straight-line basis for rewriting C, update C.Basis to point to it, + // and store the delta between C and its Basis in C.Delta. + void setBasisAndDeltaFor(Candidate &C); // Returns whether the candidate can be folded into an addressing mode. 
- bool isFoldable(const Candidate &C, TargetTransformInfo *TTI, - const DataLayout *DL); - - // Returns true if C is already in a simplest form and not worth being - // rewritten. - bool isSimplestForm(const Candidate &C); + bool isFoldable(const Candidate &C, TargetTransformInfo *TTI); // Checks whether I is in a candidate form. If so, adds all the matching forms // to Candidates, and tries to find the immediate basis for each of them. @@ -216,12 +374,6 @@ class StraightLineStrengthReduce { // Allocate candidates and find bases for GetElementPtr instructions. void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP); - // A helper function that scales Idx with ElementSize before invoking - // allocateCandidatesAndFindBasis. - void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx, - Value *S, uint64_t ElementSize, - Instruction *I); - // Adds the given form to Candidates, and finds its immediate // basis. void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B, @@ -229,13 +381,7 @@ class StraightLineStrengthReduce { Instruction *I); // Rewrites candidate C with respect to Basis. - void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis); - - // A helper function that factors ArrayIdx to a product of a stride and a - // constant index, and invokes allocateCandidatesAndFindBasis with the - // factorings. - void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize, - GetElementPtrInst *GEP); + void rewriteCandidate(const Candidate &C); // Emit code that computes the "bump" from Basis to C. static Value *emitBump(const Candidate &Basis, const Candidate &C, @@ -247,12 +393,203 @@ class StraightLineStrengthReduce { TargetTransformInfo *TTI = nullptr; std::list Candidates; - // Temporarily holds all instructions that are unlinked (but not deleted) by - // rewriteCandidateWithBasis. These instructions will be actually removed - // after all rewriting finishes. - std::vector UnlinkedInstructions; + // Map from SCEV to instructions that represent the value, + // instructions are sorted in depth-first order. + DenseMap> SCEVToInsts; + + // Record the dependency between instructions. If C.Basis == B, we would have + // {B.Ins -> {C.Ins, ...}}. + MapVector> DependencyGraph; + + // Map between each instruction and its possible candidates. + DenseMap> RewriteCandidates; + + // All instructions that have candidates sort in topological order based on + // dependency graph, from roots to leaves. + std::vector SortedCandidateInsts; + + // Record all instructions that are already rewritten and will be removed + // later. + std::vector DeadInstructions; + + // Classify candidates against Delta kind + class CandidateDictTy { + public: + using CandsTy = SmallVector; + using BBToCandsTy = DenseMap; + + private: + // Index delta Basis must have the same (Base, StrideSCEV, Inst.Type) + using IndexDeltaKeyTy = std::tuple; + DenseMap IndexDeltaCandidates; + + // Base delta Basis must have the same (StrideSCEV, Index, Inst.Type) + using BaseDeltaKeyTy = std::tuple; + DenseMap BaseDeltaCandidates; + + // Stride delta Basis must have the same (Base, Index, Inst.Type) + using StrideDeltaKeyTy = std::tuple; + DenseMap StrideDeltaCandidates; + + public: + // TODO: Disable index delta on GEP after we completely move + // from typed GEP to PtrAdd. 
+ const BBToCandsTy *getCandidatesWithDeltaKind(const Candidate &C, + Candidate::DKind K) const { + assert(K != Candidate::InvalidDelta); + if (K == Candidate::IndexDelta) { + IndexDeltaKeyTy IndexDeltaKey(C.Base, C.StrideSCEV, C.Ins->getType()); + auto It = IndexDeltaCandidates.find(IndexDeltaKey); + if (It != IndexDeltaCandidates.end()) + return &It->second; + } else if (K == Candidate::BaseDelta) { + BaseDeltaKeyTy BaseDeltaKey(C.StrideSCEV, C.Index, C.Ins->getType()); + auto It = BaseDeltaCandidates.find(BaseDeltaKey); + if (It != BaseDeltaCandidates.end()) + return &It->second; + } else { + assert(K == Candidate::StrideDelta); + StrideDeltaKeyTy StrideDeltaKey(C.Base, C.Index, C.Ins->getType()); + auto It = StrideDeltaCandidates.find(StrideDeltaKey); + if (It != StrideDeltaCandidates.end()) + return &It->second; + } + return nullptr; + } + + // Pointers to C must remain valid until CandidateDict is cleared. + void add(Candidate &C) { + Type *ValueType = C.Ins->getType(); + BasicBlock *BB = C.Ins->getParent(); + IndexDeltaKeyTy IndexDeltaKey(C.Base, C.StrideSCEV, ValueType); + BaseDeltaKeyTy BaseDeltaKey(C.StrideSCEV, C.Index, ValueType); + StrideDeltaKeyTy StrideDeltaKey(C.Base, C.Index, ValueType); + IndexDeltaCandidates[IndexDeltaKey][BB].push_back(&C); + BaseDeltaCandidates[BaseDeltaKey][BB].push_back(&C); + StrideDeltaCandidates[StrideDeltaKey][BB].push_back(&C); + } + // Remove all mappings from set + void clear() { + IndexDeltaCandidates.clear(); + BaseDeltaCandidates.clear(); + StrideDeltaCandidates.clear(); + } + } CandidateDict; + + const SCEV *getAndRecordSCEV(Value *V) { + auto *S = SE->getSCEV(V); + if (auto *I = dyn_cast(V)) + if (!isa(S) && !isa(S) && + !isa(S)) + SCEVToInsts[S].insert(I); + + return S; + } + + // Get the nearest instruction before CI that represents the value of S, + // return nullptr if no instruction is associated with S or S is not a + // reusable expression. 
+ Value *getNearestValueOfSCEV(const SCEV *S, const Instruction *CI) const { + if (isa(S)) + return nullptr; + + if (auto *SU = dyn_cast(S)) + return SU->getValue(); + if (auto *SC = dyn_cast(S)) + return SC->getValue(); + + auto It = SCEVToInsts.find(S); + if (It == SCEVToInsts.end()) + return nullptr; + + for (Instruction *I : reverse(It->second)) + if (DT->dominates(I, CI)) + return I; + + return nullptr; + } + + struct DeltaInfo { + Candidate *Cand; + Candidate::DKind DeltaKind; + Value *Delta; + + DeltaInfo() + : Cand(nullptr), DeltaKind(Candidate::InvalidDelta), Delta(nullptr) {} + DeltaInfo(Candidate *Cand, Candidate::DKind DeltaKind, Value *Delta) + : Cand(Cand), DeltaKind(DeltaKind), Delta(Delta) {} + operator bool() const { return Cand != nullptr; } + }; + + friend raw_ostream &operator<<(raw_ostream &OS, const DeltaInfo &DI); + + DeltaInfo compressPath(Candidate &C, Candidate *Basis) const; + + Candidate *pickRewriteCandidate(Instruction *I) const; + void sortCandidateInstructions(); + static Constant *getIndexDelta(Candidate &C, Candidate &Basis); + static bool isSimilar(Candidate &C, Candidate &Basis, Candidate::DKind K); + + // Add Basis -> C in DependencyGraph and propagate + // C.Stride and C.Delta's dependency to C + void addDependency(Candidate &C, Candidate *Basis) { + if (Basis) + DependencyGraph[Basis->Ins].emplace_back(C.Ins); + + // If any candidate of Inst has a basis, then Inst will be rewritten, + // C must be rewritten after rewriting Inst, so we need to propagate + // the dependency to C + auto PropagateDependency = [&](Instruction *Inst) { + if (auto CandsIt = RewriteCandidates.find(Inst); + CandsIt != RewriteCandidates.end()) + if (std::any_of(CandsIt->second.begin(), CandsIt->second.end(), + [](Candidate *Cand) { return Cand->Basis; })) + DependencyGraph[Inst].emplace_back(C.Ins); + }; + + // If C has a variable delta and the delta is a candidate, + // propagate its dependency to C + if (auto *DeltaInst = dyn_cast_or_null(C.Delta)) + PropagateDependency(DeltaInst); + + // If the stride is a candidate, propagate its dependency to C + if (auto *StrideInst = dyn_cast(C.Stride)) + PropagateDependency(StrideInst); + }; }; +inline raw_ostream &operator<<(raw_ostream &OS, + const StraightLineStrengthReduce::Candidate &C) { + OS << "Ins: " << *C.Ins << "\n Base: " << *C.Base + << "\n Index: " << *C.Index << "\n Stride: " << *C.Stride + << "\n StrideSCEV: " << *C.StrideSCEV; + if (C.Basis) + OS << "\n Delta: " << *C.Delta << "\n Basis: \n [ " << *C.Basis << " ]"; + return OS; +} + +LLVM_DUMP_METHOD +inline raw_ostream & +operator<<(raw_ostream &OS, const StraightLineStrengthReduce::DeltaInfo &DI) { + OS << "Cand: " << *DI.Cand << "\n"; + OS << "Delta Kind: "; + switch (DI.DeltaKind) { + case StraightLineStrengthReduce::Candidate::IndexDelta: + OS << "Index"; + break; + case StraightLineStrengthReduce::Candidate::BaseDelta: + OS << "Base"; + break; + case StraightLineStrengthReduce::Candidate::StrideDelta: + OS << "Stride"; + break; + default: + break; + } + OS << "\nDelta: " << *DI.Delta; + return OS; +} + } // end anonymous namespace char StraightLineStrengthReduceLegacyPass::ID = 0; @@ -269,17 +606,288 @@ FunctionPass *llvm::createStraightLineStrengthReducePass() { return new StraightLineStrengthReduceLegacyPass(); } -bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, - const Candidate &C) { - return (Basis.Ins != C.Ins && // skip the same instruction - // They must have the same type too. 
Basis.Base == C.Base - // doesn't guarantee their types are the same (PR23975). - Basis.Ins->getType() == C.Ins->getType() && - // Basis must dominate C in order to rewrite C with respect to Basis. - DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) && - // They share the same base, stride, and candidate kind. - Basis.Base == C.Base && Basis.Stride == C.Stride && - Basis.CandidateKind == C.CandidateKind); +// A helper function that unifies the bitwidth of A and B. +static void unifyBitWidth(APInt &A, APInt &B) { + if (A.getBitWidth() < B.getBitWidth()) + A = A.sext(B.getBitWidth()); + else if (A.getBitWidth() > B.getBitWidth()) + B = B.sext(A.getBitWidth()); +} + +Constant *StraightLineStrengthReduce::getIndexDelta(Candidate &C, + Candidate &Basis) { + APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue(); + unifyBitWidth(Idx, BasisIdx); + APInt IndexOffset = Idx - BasisIdx; + IntegerType *DeltaType = + IntegerType::get(C.Ins->getContext(), IndexOffset.getBitWidth()); + return ConstantInt::get(DeltaType, IndexOffset); +} + +bool StraightLineStrengthReduce::isSimilar(Candidate &C, Candidate &Basis, + Candidate::DKind K) { + bool SameType = false; + switch (K) { + case Candidate::StrideDelta: + SameType = C.StrideSCEV->getType() == Basis.StrideSCEV->getType(); + break; + case Candidate::BaseDelta: + SameType = C.Base->getType() == Basis.Base->getType(); + break; + case Candidate::IndexDelta: + SameType = true; + break; + default:; + } + return SameType && Basis.Ins != C.Ins && + Basis.CandidateKind == C.CandidateKind; +} + +void StraightLineStrengthReduce::setBasisAndDeltaFor(Candidate &C) { + auto SearchFrom = [this, &C](const CandidateDictTy::BBToCandsTy &BBToCands, + auto IsTarget) -> bool { + // Search dominating candidates by walking the immediate-dominator chain + // from the candidate's defining block upward. Visiting blocks in this + // order ensures we prefer the closest dominating basis. + const BasicBlock *BB = C.Ins->getParent(); + while (BB) { + auto It = BBToCands.find(BB); + if (It != BBToCands.end()) + for (Candidate *Basis : reverse(It->second)) + if (IsTarget(Basis)) + return true; + + const DomTreeNode *Node = DT->getNode(BB); + if (!Node) + break; + Node = Node->getIDom(); + BB = Node ? Node->getBlock() : nullptr; + } + return false; + }; + + // Priority: + // Constant Delta from Index > Constant Delta from Base > + // Constant Delta from Stride > Variable Delta from Base or Stride + // TODO: Change the priority to align with the cost model. + + // First, look for a constant index-diff basis + if (const auto *IndexDeltaCandidates = + CandidateDict.getCandidatesWithDeltaKind(C, Candidate::IndexDelta)) { + bool FoundConstDelta = + SearchFrom(*IndexDeltaCandidates, [&DT = DT, &C](Candidate *Basis) { + if (isSimilar(C, *Basis, Candidate::IndexDelta)) { + assert(DT->dominates(Basis->Ins, C.Ins)); + auto *Delta = getIndexDelta(C, *Basis); + if (!C.isProfitableRewrite(Delta, Candidate::IndexDelta)) + return false; + C.Basis = Basis; + C.DeltaKind = Candidate::IndexDelta; + C.Delta = Delta; + LLVM_DEBUG(dbgs() << "Found delta from Index " << *C.Delta << "\n"); + return true; + } + return false; + }); + if (FoundConstDelta) + return; + } + + // No constant-index-diff basis found. 
Look for the best possible base-diff + // or stride-diff basis + // Base/Stride diffs not supported for form (B + i) * S + if (C.CandidateKind == Candidate::Mul) + return; + + auto For = [this, &C](Candidate::DKind K) { + // Return true if we find a Basis with a constant delta, so the search stops; + // return false if we did not find a Basis or the delta is not a constant, + // so the search continues for a Basis with a constant delta + return [K, this, &C](Candidate *Basis) -> bool { + if (!isSimilar(C, *Basis, K)) + return false; + + assert(DT->dominates(Basis->Ins, C.Ins)); + const SCEV *BasisPart = + (K == Candidate::BaseDelta) ? Basis->Base : Basis->StrideSCEV; + const SCEV *CandPart = + (K == Candidate::BaseDelta) ? C.Base : C.StrideSCEV; + const SCEV *Diff = SE->getMinusSCEV(CandPart, BasisPart); + Value *AvailableVal = getNearestValueOfSCEV(Diff, C.Ins); + if (!AvailableVal) + return false; + + // Record delta if none has been found yet, or the new delta is + // a constant that is better than the existing delta. + if (!C.Delta || isa(AvailableVal)) { + C.Delta = AvailableVal; + C.Basis = Basis; + C.DeltaKind = K; + } + return isa(C.Delta); + }; + }; + + if (const auto *BaseDeltaCandidates = + CandidateDict.getCandidatesWithDeltaKind(C, Candidate::BaseDelta)) { + if (SearchFrom(*BaseDeltaCandidates, For(Candidate::BaseDelta))) { + LLVM_DEBUG(dbgs() << "Found delta from Base: " << *C.Delta << "\n"); + return; + } + } + + if (const auto *StrideDeltaCandidates = + CandidateDict.getCandidatesWithDeltaKind(C, Candidate::StrideDelta)) { + if (SearchFrom(*StrideDeltaCandidates, For(Candidate::StrideDelta))) { + LLVM_DEBUG(dbgs() << "Found delta from Stride: " << *C.Delta << "\n"); + return; + } + } + + // If we did not find a constant delta, we might have found a variable delta + if (C.Delta) { + LLVM_DEBUG({ + dbgs() << "Found delta from "; + if (C.DeltaKind == Candidate::BaseDelta) + dbgs() << "Base: "; + else + dbgs() << "Stride: "; + dbgs() << *C.Delta << "\n"; + }); + assert(C.DeltaKind != Candidate::InvalidDelta && C.Basis); + } +} + +// Compress the path from `Basis` to the deepest Basis in the Basis chain +// to avoid unprofitable data dependencies and improve ILP.
+// X = A + 1 +// Y = X + 1 +// Z = Y + 1 +// -> +// X = A + 1 +// Y = A + 2 +// Z = A + 3 +// Return the delta info for C against the new Basis +auto StraightLineStrengthReduce::compressPath( + Candidate &C, Candidate *Basis) const -> DeltaInfo { + if (!Basis || !Basis->Basis || C.CandidateKind == Candidate::Mul) + return {}; + Candidate *Root = Basis; + Value *NewDelta = nullptr; + auto NewKind = Candidate::InvalidDelta; + + while (Root->Basis) { + Candidate *NextRoot = Root->Basis; + if (C.Base == NextRoot->Base && C.StrideSCEV == NextRoot->StrideSCEV && + isSimilar(C, *NextRoot, Candidate::IndexDelta)) { + ConstantInt *CI = cast(getIndexDelta(C, *NextRoot)); + if (CI->isZero() || CI->isOne() || isa(C.StrideSCEV)) { + Root = NextRoot; + NewKind = Candidate::IndexDelta; + NewDelta = CI; + continue; + } + } + + const SCEV *CandPart = nullptr; + const SCEV *BasisPart = nullptr; + auto CurrKind = Candidate::InvalidDelta; + if (C.Base == NextRoot->Base && C.Index == NextRoot->Index) { + CandPart = C.StrideSCEV; + BasisPart = NextRoot->StrideSCEV; + CurrKind = Candidate::StrideDelta; + } else if (C.StrideSCEV == NextRoot->StrideSCEV && + C.Index == NextRoot->Index) { + CandPart = C.Base; + BasisPart = NextRoot->Base; + CurrKind = Candidate::BaseDelta; + } else + break; + + if (!isSimilar(C, *NextRoot, CurrKind)) + break; + + if (auto DeltaVal = + dyn_cast(SE->getMinusSCEV(CandPart, BasisPart))) { + Root = NextRoot; + NewDelta = DeltaVal->getValue(); + NewKind = CurrKind; + } else + break; + } + + if (Root != Basis) { + assert(NewKind != Candidate::InvalidDelta && NewDelta); + LLVM_DEBUG(dbgs() << "Found new Basis with " << *NewDelta + << " from path compression.\n"); + return {Root, NewKind, NewDelta}; + } + + return {}; +} + +// Topologically sort candidate instructions based on their relationships in +// the dependency graph. +void StraightLineStrengthReduce::sortCandidateInstructions() { + SortedCandidateInsts.clear(); + // An instruction may have multiple candidates that get different Basis + // instructions, and each candidate can get dependencies from Basis and + // Stride when Stride will also be rewritten by SLSR. Hence, an instruction + // may have multiple dependencies. Use InDegree to ensure all of an + // instruction's dependencies are processed before the instruction itself. + DenseMap InDegree; + for (auto &KV : DependencyGraph) { + InDegree.try_emplace(KV.first, 0); + + for (auto *Child : KV.second) { + InDegree[Child]++; + } + } + std::queue WorkList; + DenseSet Visited; + + for (auto &KV : DependencyGraph) + if (InDegree[KV.first] == 0) + WorkList.push(KV.first); + + while (!WorkList.empty()) { + Instruction *I = WorkList.front(); + WorkList.pop(); + if (!Visited.insert(I).second) + continue; + + SortedCandidateInsts.push_back(I); + + for (auto *Next : DependencyGraph[I]) { + auto &Degree = InDegree[Next]; + if (--Degree == 0) + WorkList.push(Next); + } + } + + assert(SortedCandidateInsts.size() == DependencyGraph.size() && + "Dependency graph should not have cycles"); +} + +auto StraightLineStrengthReduce::pickRewriteCandidate(Instruction *I) const + -> Candidate * { + // Return the candidate of instruction I that has the highest profit.
+ auto It = RewriteCandidates.find(I); + if (It == RewriteCandidates.end()) + return nullptr; + + Candidate *BestC = nullptr; + unsigned BestProfit = 0; + for (Candidate *C : reverse(It->second)) + if (C->Basis) { + unsigned Profit = C->getRewriteProfit(); + if (Profit > BestProfit) { + BestProfit = Profit; + BestC = C; + } + } + + return BestC; } static bool isGEPFoldable(GetElementPtrInst *GEP, @@ -299,8 +907,7 @@ static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride, } bool StraightLineStrengthReduce::isFoldable(const Candidate &C, - TargetTransformInfo *TTI, - const DataLayout *DL) { + TargetTransformInfo *TTI) { if (C.CandidateKind == Candidate::Add) return isAddFoldable(C.Base, C.Index, C.Stride, TTI); if (C.CandidateKind == Candidate::GEP) @@ -308,75 +915,39 @@ bool StraightLineStrengthReduce::isFoldable(const Candidate &C, return false; } -// Returns true if GEP has zero or one non-zero index. -static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) { - unsigned NumNonZeroIndices = 0; - for (Use &Idx : GEP->indices()) { - ConstantInt *ConstIdx = dyn_cast(Idx); - if (ConstIdx == nullptr || !ConstIdx->isZero()) - ++NumNonZeroIndices; - } - return NumNonZeroIndices <= 1; -} - -bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) { - if (C.CandidateKind == Candidate::Add) { - // B + 1 * S or B + (-1) * S - return C.Index->isOne() || C.Index->isMinusOne(); - } - if (C.CandidateKind == Candidate::Mul) { - // (B + 0) * S - return C.Index->isZero(); - } - if (C.CandidateKind == Candidate::GEP) { - // (char*)B + S or (char*)B - S - return ((C.Index->isOne() || C.Index->isMinusOne()) && - hasOnlyOneNonZeroIndex(cast(C.Ins))); - } - return false; -} - -// TODO: We currently implement an algorithm whose time complexity is linear in -// the number of existing candidates. However, we could do better by using -// ScopedHashTable. Specifically, while traversing the dominator tree, we could -// maintain all the candidates that dominate the basic block being traversed in -// a ScopedHashTable. This hash table is indexed by the base and the stride of -// a candidate. Therefore, finding the immediate basis of a candidate boils down -// to one hash-table look up. void StraightLineStrengthReduce::allocateCandidatesAndFindBasis( Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S, Instruction *I) { - Candidate C(CT, B, Idx, S, I); - // SLSR can complicate an instruction in two cases: - // - // 1. If we can fold I into an addressing mode, computing I is likely free or - // takes only one instruction. - // - // 2. I is already in a simplest form. For example, when - // X = B + 8 * S - // Y = B + S, - // rewriting Y to X - 7 * S is probably a bad idea. + // Record the SCEV of S, which we may later use as a variable delta. This + // ensures that we rewrite C with existing IR that reproduces the delta value. + + Candidate C(CT, B, Idx, S, I, getAndRecordSCEV(S)); + // If we can fold I into an addressing mode, computing I is likely free or + // takes only one instruction. So, we don't need to analyze or rewrite it. // - // In the above cases, we still add I to the candidate list so that I can be - // the basis of other candidates, but we leave I's basis blank so that I - // won't be rewritten. - if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) { - // Try to compute the immediate basis of C. - unsigned NumIterations = 0; - // Limit the scan radius to avoid running in quadratic time.
- static const unsigned MaxNumIterations = 50; - for (auto Basis = Candidates.rbegin(); - Basis != Candidates.rend() && NumIterations < MaxNumIterations; - ++Basis, ++NumIterations) { - if (isBasisFor(*Basis, C)) { - C.Basis = &(*Basis); - break; - } + // Currently, this algorithm can at best optimize complex computations into + // a `variable + constant` or `variable * constant` form. However, some + // targets have stricter constraints on their addressing modes. + // For example, a `variable + constant` can only be folded into an addressing + // mode if the constant falls within a certain range. + // So, we also check whether the instruction is already efficient enough + // for the strength reduction algorithm. + if (!isFoldable(C, TTI) && !C.isHighEfficiency()) { + setBasisAndDeltaFor(C); + + // Compress unnecessary rewrite chains to improve ILP + if (auto Res = compressPath(C, C.Basis)) { + C.Basis = Res.Cand; + C.DeltaKind = Res.DeltaKind; + C.Delta = Res.Delta; } } // Regardless of whether we find a basis for C, we need to push C to the // candidate list so that it can be the basis of other candidates. + LLVM_DEBUG(dbgs() << "Allocated Candidate: " << C << "\n"); Candidates.push_back(C); + RewriteCandidates[C.Ins].push_back(&Candidates.back()); + CandidateDict.add(Candidates.back()); } void StraightLineStrengthReduce::allocateCandidatesAndFindBasis( @@ -475,54 +1046,6 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul( } } -void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( - const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize, - Instruction *I) { - // I = B + sext(Idx *nsw S) * ElementSize - // = B + (sext(Idx) * sext(S)) * ElementSize - // = B + (sext(Idx) * ElementSize) * sext(S) - // Casting to IntegerType is safe because we skipped vector GEPs. - IntegerType *PtrIdxTy = cast(DL->getIndexType(I->getType())); - ConstantInt *ScaledIdx = ConstantInt::get( - PtrIdxTy, Idx->getSExtValue() * (int64_t)ElementSize, true); - allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I); -} - -void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx, - const SCEV *Base, - uint64_t ElementSize, - GetElementPtrInst *GEP) { - // At least, ArrayIdx = ArrayIdx *nsw 1. - allocateCandidatesAndFindBasisForGEP( - Base, ConstantInt::get(cast(ArrayIdx->getType()), 1), - ArrayIdx, ElementSize, GEP); - Value *LHS = nullptr; - ConstantInt *RHS = nullptr; - // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx - // itself. This would allow us to handle the shl case for free. However, - // matching SCEVs has two issues: - // - // 1. this would complicate rewriting because the rewriting procedure - // would have to translate SCEVs back to IR instructions. This translation - // is difficult when LHS is further evaluated to a composite SCEV. - // - // 2. ScalarEvolution is designed to be control-flow oblivious. It tends - // to strip nsw/nuw flags which are critical for SLSR to trace into - // sext'ed multiplication. - if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) { - // SLSR is currently unsafe if i * S may overflow.
- // GEP = Base + sext(LHS *nsw RHS) * ElementSize - allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP); - } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) { - // GEP = Base + sext(LHS <getBitWidth(), 1); - ConstantInt *PowerOf2 = - ConstantInt::get(RHS->getContext(), One << RHS->getValue()); - allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP); - } -} - void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( GetElementPtrInst *GEP) { // TODO: handle vector GEPs @@ -546,11 +1069,14 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( const SCEV *BaseExpr = SE->getGEPExpr(cast(GEP), IndexExprs); Value *ArrayIdx = GEP->getOperand(I); uint64_t ElementSize = GTI.getSequentialElementStride(*DL); + IntegerType *PtrIdxTy = cast(DL->getIndexType(GEP->getType())); + ConstantInt *ElementSizeIdx = ConstantInt::get(PtrIdxTy, ElementSize, true); if (ArrayIdx->getType()->getIntegerBitWidth() <= DL->getIndexSizeInBits(GEP->getAddressSpace())) { // Skip factoring if ArrayIdx is wider than the index size, because // ArrayIdx is implicitly truncated to the index size. - factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP); + allocateCandidatesAndFindBasis(Candidate::GEP, BaseExpr, ElementSizeIdx, + ArrayIdx, GEP); } // When ArrayIdx is the sext of a value, we try to factor that value as // well. Handling this case is important because array indices are @@ -561,118 +1087,158 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( DL->getIndexSizeInBits(GEP->getAddressSpace())) { // Skip factoring if TruncatedArrayIdx is wider than the pointer size, // because TruncatedArrayIdx is implicitly truncated to the pointer size. - factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP); + allocateCandidatesAndFindBasis(Candidate::GEP, BaseExpr, ElementSizeIdx, + TruncatedArrayIdx, GEP); } IndexExprs[I - 1] = OrigIndexExpr; } } -// A helper function that unifies the bitwidth of A and B. -static void unifyBitWidth(APInt &A, APInt &B) { - if (A.getBitWidth() < B.getBitWidth()) - A = A.sext(B.getBitWidth()); - else if (A.getBitWidth() > B.getBitWidth()) - B = B.sext(A.getBitWidth()); -} - Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis, const Candidate &C, IRBuilder<> &Builder, const DataLayout *DL) { - APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue(); - unifyBitWidth(Idx, BasisIdx); - APInt IndexOffset = Idx - BasisIdx; + auto CreateMul = [&](Value *LHS, Value *RHS) { + if (ConstantInt *CR = dyn_cast(RHS)) { + const APInt &ConstRHS = CR->getValue(); + IntegerType *DeltaType = + IntegerType::get(C.Ins->getContext(), ConstRHS.getBitWidth()); + if (ConstRHS.isPowerOf2()) { + ConstantInt *Exponent = + ConstantInt::get(DeltaType, ConstRHS.logBase2()); + return Builder.CreateShl(LHS, Exponent); + } + if (ConstRHS.isNegatedPowerOf2()) { + ConstantInt *Exponent = + ConstantInt::get(DeltaType, (-ConstRHS).logBase2()); + return Builder.CreateNeg(Builder.CreateShl(LHS, Exponent)); + } + } - // Compute Bump = C - Basis = (i' - i) * S. - // Common case 1: if (i' - i) is 1, Bump = S. - if (IndexOffset == 1) - return C.Stride; - // Common case 2: if (i' - i) is -1, Bump = -S. - if (IndexOffset.isAllOnes()) - return Builder.CreateNeg(C.Stride); + return Builder.CreateMul(LHS, RHS); + }; - // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may - // have different bit widths. 
- IntegerType *DeltaType = - IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth()); - Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType); - if (IndexOffset.isPowerOf2()) { - // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i). - ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2()); - return Builder.CreateShl(ExtendedStride, Exponent); + if (C.DeltaKind == Candidate::IndexDelta) { + APInt IndexOffset = cast(C.Delta)->getValue(); + // IndexDelta + // X = B + i * S + // Y = B + i` * S + // = B + (i + IndexDelta) * S + // = B + i * S + IndexDelta * S + // = X + IndexDelta * S + // Bump = (i' - i) * S + + // If Delta is 0, C is a fully redundant of C.Basis, + // just replace C.Ins with Basis.Ins + if (IndexOffset.isZero()) + return nullptr; + + // Compute Bump = C - Basis = (i' - i) * S. + // Common case 1: if (i' - i) is 1, Bump = S. + if (IndexOffset == 1) + return C.Stride; + // Common case 2: if (i' - i) is -1, Bump = -S. + if (IndexOffset.isAllOnes()) + return Builder.CreateNeg(C.Stride); + + IntegerType *DeltaType = + IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth()); + Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType); + + return CreateMul(ExtendedStride, C.Delta); } - if (IndexOffset.isNegatedPowerOf2()) { - // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i' - i). - ConstantInt *Exponent = - ConstantInt::get(DeltaType, (-IndexOffset).logBase2()); - return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent)); + + assert(C.DeltaKind == Candidate::StrideDelta || + C.DeltaKind == Candidate::BaseDelta); + assert(C.CandidateKind != Candidate::Mul); + // StrideDelta + // X = B + i * S + // Y = B + i * S' + // = B + i * (S + StrideDelta) + // = B + i * S + i * StrideDelta + // = X + i * StrideDelta + // Bump = i * (S' - S) + // + // BaseDelta + // X = B + i * S + // Y = B' + i * S + // = (B + BaseDelta) + i * S + // = X + BaseDelta + // Bump = (B' - B). + Value *Bump = C.Delta; + if (C.DeltaKind == Candidate::StrideDelta) { + // If this value is consumed by a GEP, promote StrideDelta before doing + // StrideDelta * Index to ensure the same semantics as the original GEP. + if (C.CandidateKind == Candidate::GEP) { + auto *GEP = cast(C.Ins); + Type *NewScalarIndexTy = + DL->getIndexType(GEP->getPointerOperandType()->getScalarType()); + Bump = Builder.CreateSExtOrTrunc(Bump, NewScalarIndexTy); + } + if (!C.Index->isOne()) { + Value *ExtendedIndex = + Builder.CreateSExtOrTrunc(C.Index, Bump->getType()); + Bump = CreateMul(Bump, ExtendedIndex); + } } - Constant *Delta = ConstantInt::get(DeltaType, IndexOffset); - return Builder.CreateMul(ExtendedStride, Delta); + return Bump; } -void StraightLineStrengthReduce::rewriteCandidateWithBasis( - const Candidate &C, const Candidate &Basis) { +void StraightLineStrengthReduce::rewriteCandidate(const Candidate &C) { if (!DebugCounter::shouldExecute(StraightLineStrengthReduceCounter)) return; - assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base && - C.Stride == Basis.Stride); - // We run rewriteCandidateWithBasis on all candidates in a post-order, so the - // basis of a candidate cannot be unlinked before the candidate. - assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked"); - - // An instruction can correspond to multiple candidates. Therefore, instead of - // simply deleting an instruction when we rewrite it, we mark its parent as - // nullptr (i.e. 
unlink it) so that we can skip the candidates whose - // instruction is already rewritten. - if (!C.Ins->getParent()) - return; + const Candidate &Basis = *C.Basis; + assert(C.Delta && C.CandidateKind == Basis.CandidateKind && + C.hasValidDelta(Basis)); IRBuilder<> Builder(C.Ins); Value *Bump = emitBump(Basis, C, Builder, DL); Value *Reduced = nullptr; // equivalent to but weaker than C.Ins - switch (C.CandidateKind) { - case Candidate::Add: - case Candidate::Mul: { - // C = Basis + Bump - Value *NegBump; - if (match(Bump, m_Neg(m_Value(NegBump)))) { - // If Bump is a neg instruction, emit C = Basis - (-Bump). - Reduced = Builder.CreateSub(Basis.Ins, NegBump); - // We only use the negative argument of Bump, and Bump itself may be - // trivially dead. - RecursivelyDeleteTriviallyDeadInstructions(Bump); - } else { - // It's tempting to preserve nsw on Bump and/or Reduced. However, it's - // usually unsound, e.g., - // - // X = (-2 +nsw 1) *nsw INT_MAX - // Y = (-2 +nsw 3) *nsw INT_MAX - // => - // Y = X + 2 * INT_MAX - // - // Neither + and * in the resultant expression are nsw. - Reduced = Builder.CreateAdd(Basis.Ins, Bump); + // If delta is 0, C is a fully redundant of Basis, and Bump is nullptr, + // just replace C.Ins with Basis.Ins + if (!Bump) + Reduced = Basis.Ins; + else { + switch (C.CandidateKind) { + case Candidate::Add: + case Candidate::Mul: { + // C = Basis + Bump + Value *NegBump; + if (match(Bump, m_Neg(m_Value(NegBump)))) { + // If Bump is a neg instruction, emit C = Basis - (-Bump). + Reduced = Builder.CreateSub(Basis.Ins, NegBump); + // We only use the negative argument of Bump, and Bump itself may be + // trivially dead. + RecursivelyDeleteTriviallyDeadInstructions(Bump); + } else { + // It's tempting to preserve nsw on Bump and/or Reduced. However, it's + // usually unsound, e.g., + // + // X = (-2 +nsw 1) *nsw INT_MAX + // Y = (-2 +nsw 3) *nsw INT_MAX + // => + // Y = X + 2 * INT_MAX + // + // Neither + and * in the resultant expression are nsw. + Reduced = Builder.CreateAdd(Basis.Ins, Bump); + } + break; } - break; - } - case Candidate::GEP: { - bool InBounds = cast(C.Ins)->isInBounds(); - // C = (char *)Basis + Bump - Reduced = Builder.CreatePtrAdd(Basis.Ins, Bump, "", InBounds); - break; + case Candidate::GEP: { + bool InBounds = cast(C.Ins)->isInBounds(); + // C = (char *)Basis + Bump + Reduced = Builder.CreatePtrAdd(Basis.Ins, Bump, "", InBounds); + break; + } + default: + llvm_unreachable("C.CandidateKind is invalid"); + }; + Reduced->takeName(C.Ins); } - default: - llvm_unreachable("C.CandidateKind is invalid"); - }; - Reduced->takeName(C.Ins); C.Ins->replaceAllUsesWith(Reduced); - // Unlink C.Ins so that we can skip other candidates also corresponding to - // C.Ins. The actual deletion is postponed to the end of runOnFunction. - C.Ins->removeFromParent(); - UnlinkedInstructions.push_back(C.Ins); + DeadInstructions.push_back(C.Ins); } bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) { @@ -686,33 +1252,43 @@ bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) { } bool StraightLineStrengthReduce::runOnFunction(Function &F) { + LLVM_DEBUG(dbgs() << "SLSR on Function: " << F.getName() << "\n"); // Traverse the dominator tree in the depth-first order. This order makes sure // all bases of a candidate are in Candidates when we process it. for (const auto Node : depth_first(DT)) for (auto &I : *(Node->getBlock())) allocateCandidatesAndFindBasis(&I); - // Rewrite candidates in the reverse depth-first order. 
This order makes sure - // a candidate being rewritten is not a basis for any other candidate. - while (!Candidates.empty()) { - const Candidate &C = Candidates.back(); - if (C.Basis != nullptr) { - rewriteCandidateWithBasis(C, *C.Basis); - } - Candidates.pop_back(); - } - - // Delete all unlink instructions. - for (auto *UnlinkedInst : UnlinkedInstructions) { - for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) { - Value *Op = UnlinkedInst->getOperand(I); - UnlinkedInst->setOperand(I, nullptr); - RecursivelyDeleteTriviallyDeadInstructions(Op); - } - UnlinkedInst->deleteValue(); + // Build the dependency graph and sort candidate instructions from dependency + // roots to leaves + for (auto &C : Candidates) { + if (DependencyGraph.find(C.Ins) == DependencyGraph.end()) + DependencyGraph[C.Ins] = {}; + addDependency(C, C.Basis); } - bool Ret = !UnlinkedInstructions.empty(); - UnlinkedInstructions.clear(); + sortCandidateInstructions(); + + // Rewrite candidates in the topological order that rewrites a Candidate + // always before rewriting its Basis + for (Instruction *I : reverse(SortedCandidateInsts)) + if (Candidate *C = pickRewriteCandidate(I)) + rewriteCandidate(*C); + + for (auto *DeadIns : DeadInstructions) + // A dead instruction may be another dead instruction's op, + // don't delete an instruction twice + if (DeadIns->getParent()) + RecursivelyDeleteTriviallyDeadInstructions(DeadIns); + + bool Ret = !DeadInstructions.empty(); + DeadInstructions.clear(); + DependencyGraph.clear(); + RewriteCandidates.clear(); + SortedCandidateInsts.clear(); + // First clear all references to candidates in the list + CandidateDict.clear(); + // Then destroy the list + Candidates.clear(); return Ret; } diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index ebbeab94066d6..ef7a13819a799 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -541,10 +541,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_lshr_b32 s2, s0, 16 ; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2 ; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 -; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 ; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 ; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX908-NEXT: s_or_b32 s14, s14, 28 ; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -610,13 +609,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: s_add_u32 s22, s20, s9 ; GFX908-NEXT: s_addc_u32 s23, s21, s13 -; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc +; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:16 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc +; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:20 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:-4 glc +; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:24 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v17, s[22:23] glc +; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:28 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ds_read_b64 v[12:13], v17 ; GFX908-NEXT: ds_read_b64 
v[14:15], v0 @@ -710,7 +709,6 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 ; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 ; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GFX90A-NEXT: s_or_b32 s14, s14, 28 ; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v18 @@ -771,13 +769,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: s_add_u32 s22, s20, s9 ; GFX90A-NEXT: s_addc_u32 s23, s21, s13 -; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc +; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:16 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc +; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:20 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:-4 glc +; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:24 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] glc +; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:28 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll index af1c64321222b..1b447571efaf2 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn -amdgpu-scalar-ir-passes=false < %s | FileCheck %s ; Test for a bug where DAGCombiner::ReassociateOps() was creating adds ; with offset in the first operand and base pointers in the second. 
diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index b443e654350c5..d3a31f7133ab9 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2396,7 +2396,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 -; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v4, v3 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2417,7 +2417,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 -; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 +; GFX9-DL-NEXT: v_add3_u32 v1, v1, v4, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -2442,7 +2442,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1 +; GFX10-DL-NEXT: v_add3_u32 v0, v0, v2, v1 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -2553,7 +2553,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 -; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 +; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v4, v3 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2574,7 +2574,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 -; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 +; GFX9-DL-NEXT: v_add3_u32 v1, v1, v4, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -2599,7 +2599,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1 +; GFX10-DL-NEXT: v_add3_u32 v0, v0, v2, v1 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index ab38bd21994ec..c96538b557cf6 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -3268,19 +3268,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 -; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_mul_u32_u24_e32 v1, v1, v5 +; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 +; GFX7-NEXT: 
v_mul_u32_u24_e32 v3, v6, v3 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 +; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -3307,18 +3307,18 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8 +; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 -; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v2 +; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v9, sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX8-NEXT: v_mad_u16 v6, v8, v7, v6 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 +; GFX8-NEXT: v_mad_u16 v6, v6, v7, v8 ; GFX8-NEXT: v_mad_u16 v4, v4, v5, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 @@ -3337,19 +3337,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8 +; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v2 +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v4, v5, v6 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v3, v4 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v3, v4 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 16 @@ -3367,19 +3367,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 +; 
GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 -; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xff, v2 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 +; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v4, v5, v6 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v3, v4 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v3, v4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 16 @@ -3392,28 +3392,28 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2 +; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v3 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v3 -; GFX10-DL-NEXT: v_bfe_i32 v3, v4, 0, 8 -; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_bfe_i32 v6, v7, 0, 8 +; GFX10-DL-NEXT: v_mul_lo_u16 v0, v3, v0 +; GFX10-DL-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_bfe_i32 v4, v5, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v3, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v5, v6, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v3, v4, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -3429,32 +3429,34 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-TRUE16-NEXT: s_clause 0x1 -; 
GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1] -; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[2:3] +; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1] +; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2 -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v3 ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.h -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l -; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v5.l -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8 -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v3 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4 +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v3, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 8 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l +; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l ; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l -; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 -; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v2.l, v0.l -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l +; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.h +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v4 +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v0.h, v0.l +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8 ; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v3.l, v0.l +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l +; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v4.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -3473,24 +3475,25 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX11-DL-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-DL-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v2, v1, 0, 8 +; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX11-DL-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0 -; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 8, 
v1 -; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v0 -; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 -; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 -; GFX11-DL-FAKE16-NEXT: v_mul_lo_u16 v2, v2, v3 -; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8 -; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v0 +; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 +; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v6, v1, 0, 8 +; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8 +; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v0 ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v5, v6, 0, 8 -; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 ; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v4, v3, v2 +; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GFX11-DL-FAKE16-NEXT: v_mul_lo_u16 v2, v3, v2 +; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v6, v5, v2 +; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v6, v7, v2 +; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v4, v3, v2 ; GFX11-DL-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v2 ; GFX11-DL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 50f0a39802270..02cca4abc87b4 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -1684,7 +1684,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4 -; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2 +; GFX9-NEXT: v_add3_u32 v1, v1, v17, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -1735,7 +1735,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4 -; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2 +; GFX9-DL-NEXT: v_add3_u32 v1, v1, v17, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -1789,7 +1789,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v6, v5 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v13, v3, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 85a9aba1a0e51..22b46671e7649 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -365,107 +365,110 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; 
GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0 -; GFX8-NEXT: v_and_b32_e32 v12, 0xfe000000, v1 +; GFX8-NEXT: v_and_b32_e32 v10, 0xfe000000, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: s_movk_i32 s0, 0x5000 +; GFX8-NEXT: s_movk_i32 s0, 0x2800 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v10, 0 +; GFX8-NEXT: v_mov_b32_e32 v6, 0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v11, 0 -; GFX8-NEXT: v_mov_b32_e32 v13, 0x7f +; GFX8-NEXT: v_mov_b32_e32 v7, 0 +; GFX8-NEXT: v_mov_b32_e32 v11, 0x7f +; GFX8-NEXT: s_movk_i32 s1, 0x800 +; GFX8-NEXT: s_movk_i32 s2, 0x1000 +; GFX8-NEXT: s_movk_i32 s3, 0x1800 +; GFX8-NEXT: s_movk_i32 s4, 0x2000 ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_mov_b32 s0, 0 +; GFX8-NEXT: s_mov_b32 s5, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffb000, v2 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[4:5] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffb800, v2 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[6:7] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffc000, v2 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2 -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2 -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[20:21] -; GFX8-NEXT: s_addk_i32 s0, 0x2000 -; GFX8-NEXT: s_cmp_gt_u32 s0, 0x3fffff -; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v24, vcc, v14, v10 -; GFX8-NEXT: v_addc_u32_e32 v25, vcc, v15, v11, vcc -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffe800, v2 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfffff000, v2 -; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[22:23] -; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] -; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v22, vcc, v16, v24 -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v17, v25, vcc -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xfffff800, v2 -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15] +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffd800, v2 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[2:3] +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[8:9] +; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffe000, v2 +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffe800, v2 
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[8:9] +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xfffff000, v2 ; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xfffff800, v2 +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[16:17] +; GFX8-NEXT: v_add_u32_e32 v16, vcc, s1, v2 +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc +; GFX8-NEXT: s_addk_i32 s5, 0x2000 +; GFX8-NEXT: s_cmp_gt_u32 s5, 0x3fffff +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e32 v22, vcc, v14, v6 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v15, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, s2, v2 +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[20:21] ; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17] -; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v20, vcc, s3, v2 +; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(4) +; GFX8-NEXT: v_add_u32_e32 v22, vcc, v12, v22 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v13, v23, vcc +; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v2 +; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] +; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[20:21] +; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(5) ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v18, v22 ; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v19, v23, vcc -; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[2:3] +; GFX8-NEXT: v_add_u32_e32 v18, vcc, s0, v2 +; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] +; GFX8-NEXT: v_addc_u32_e32 v19, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x10000, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(7) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v22 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v23, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v22 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v23, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc +; GFX8-NEXT: v_add_u32_e32 v8, vcc, v14, v8 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v15, v9, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v20, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v21, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v16, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v17, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v14, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v20, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v21, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v16, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v17, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v10, vcc, v18, v4 -; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v19, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v18, v4 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 
v19, v5, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, 1, v13 +; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 1, v11 ; GFX8-NEXT: s_and_b64 vcc, exec, vcc ; GFX8-NEXT: s_cbranch_vccz .LBB1_1 ; GFX8-NEXT: ; %bb.4: ; %while.end ; GFX8-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v12 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v10 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[10:11] +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: clmem_read: @@ -495,79 +498,76 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_mov_b32_e32 v1, s35 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x5000, v0 +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x2800, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v7, 0x7f -; GFX900-NEXT: s_movk_i32 s2, 0xd000 -; GFX900-NEXT: s_movk_i32 s3, 0xe000 -; GFX900-NEXT: s_movk_i32 s4, 0xf000 +; GFX900-NEXT: s_movk_i32 s2, 0xf000 +; GFX900-NEXT: s_movk_i32 s3, 0x1000 +; GFX900-NEXT: s_movk_i32 s4, 0x2000 ; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX900-NEXT: ; =>This Loop Header: Depth=1 ; GFX900-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: s_mov_b32 s5, 0 ; GFX900-NEXT: v_mov_b32_e32 v2, v0 +; GFX900-NEXT: s_mov_b32 s5, 0 ; GFX900-NEXT: .LBB1_2: ; %for.body ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2 +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffe000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 ; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off -; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s2, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v3, vcc -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2 -; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[14:15], v[8:9], off offset:-2048 ; GFX900-NEXT: s_addk_i32 s5, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096 -; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22 -; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1] -; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 -; GFX900-NEXT: global_load_dwordx2 v[22:23], v[14:15], off -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(5) -; 
GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24 -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off -; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v14, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off +; GFX900-NEXT: global_load_dwordx2 v[14:15], v[8:9], off +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v14, v16 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v17, vcc +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, s2, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off offset:-2048 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v8, v14 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v9, v15, vcc +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048 +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v10, v14 +; GFX900-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v15, vcc +; GFX900-NEXT: v_add_co_u32_e64 v14, s[0:1], v12, v14 +; GFX900-NEXT: v_addc_co_u32_e64 v15, s[0:1], v13, v11, s[0:1] +; GFX900-NEXT: v_add_co_u32_e32 v10, vcc, s3, v2 +; GFX900-NEXT: v_add_co_u32_e64 v12, s[0:1], s4, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc +; GFX900-NEXT: v_addc_co_u32_e64 v13, vcc, 0, v3, s[0:1] +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v4, v14 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v5, v15, vcc +; GFX900-NEXT: global_load_dwordx2 v[4:5], v[12:13], off offset:-4096 +; GFX900-NEXT: global_load_dwordx2 v[14:15], v[10:11], off offset:2048 +; GFX900-NEXT: s_waitcnt vmcnt(2) +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v8, v16 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v9, v17, vcc +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[12:13], off +; GFX900-NEXT: global_load_dwordx2 v[10:11], v[12:13], off offset:2048 ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc -; GFX900-NEXT: s_waitcnt vmcnt(5) -; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20 -; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc -; GFX900-NEXT: s_waitcnt vmcnt(4) -; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v8, v16 -; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v17, vcc ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v18, v8 -; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v19, v9, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v16 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v17, vcc ; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v22, v8 -; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v23, v9, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v9, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: 
Header=BB1_1 Depth=1 @@ -610,7 +610,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6 ; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2800, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 @@ -621,29 +621,30 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: .LBB1_2: ; %for.body ; GFX10-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffb800 +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffe000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0xffffc800 +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0xfffff000 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v4, 0xffffd800 -; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v18, vcc_lo, v4, 0xffffe800 -; GFX10-NEXT: s_clause 0x2 +; GFX10-NEXT: s_clause 0x5 ; GFX10-NEXT: global_load_dwordx2 v[12:13], v[8:9], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[8:9], off ; GFX10-NEXT: global_load_dwordx2 v[16:17], v[10:11], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:-2048 -; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v22, vcc_lo, 0xfffff000, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v23, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: global_load_dwordx2 v[24:25], v[18:19], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[18:19], v[10:11], off +; GFX10-NEXT: global_load_dwordx2 v[20:21], v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[22:23], v[4:5], off +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0x1000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0x2000 +; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v5, vcc_lo +; GFX10-NEXT: global_load_dwordx2 v[24:25], v[8:9], off offset:-2048 +; GFX10-NEXT: v_add_co_u32 v26, vcc_lo, 0x2800, v4 +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[28:29], v[10:11], off offset:-2048 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[8:9], off -; GFX10-NEXT: global_load_dwordx2 v[10:11], v[10:11], off -; GFX10-NEXT: global_load_dwordx2 v[14:15], v[14:15], off -; GFX10-NEXT: global_load_dwordx2 v[26:27], v[18:19], off -; GFX10-NEXT: global_load_dwordx2 v[28:29], v[22:23], off -; GFX10-NEXT: global_load_dwordx2 v[30:31], v[4:5], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[32:33], v[4:5], off +; GFX10-NEXT: v_add_co_ci_u32_e32 v27, vcc_lo, 0, v5, vcc_lo +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: global_load_dwordx2 v[30:31], v[10:11], off +; GFX10-NEXT: global_load_dwordx2 v[32:33], v[26:27], off ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo ; GFX10-NEXT: s_addk_i32 s1, 0x2000 @@ -651,25 +652,27 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_waitcnt vmcnt(10) ; GFX10-NEXT: v_add_co_u32 v2, s0, v12, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v13, v3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_add_co_u32 v2, s0, 
v8, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v9, v3, s0 +; GFX10-NEXT: s_waitcnt vmcnt(9) +; GFX10-NEXT: v_add_co_u32 v2, s0, v14, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v15, v3, s0 +; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: v_add_co_u32 v2, s0, v16, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v17, v3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_add_co_u32 v2, s0, v10, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v11, v3, s0 +; GFX10-NEXT: s_waitcnt vmcnt(7) +; GFX10-NEXT: v_add_co_u32 v2, s0, v18, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v19, v3, s0 +; GFX10-NEXT: s_waitcnt vmcnt(6) ; GFX10-NEXT: v_add_co_u32 v2, s0, v20, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v21, v3, s0 +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_add_co_u32 v2, s0, v22, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v23, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_add_co_u32 v2, s0, v14, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v15, v3, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v24, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v25, v3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(3) -; GFX10-NEXT: v_add_co_u32 v2, s0, v26, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v27, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(2) +; GFX10-NEXT: v_add_co_u32 v2, s0, v8, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v9, v3, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v28, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v29, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -717,77 +720,73 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_mov_b32_e32 v2, s35 ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x5000, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x2800, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x7f -; GFX90A-NEXT: s_movk_i32 s0, 0xd000 -; GFX90A-NEXT: s_movk_i32 s1, 0xe000 ; GFX90A-NEXT: s_movk_i32 s2, 0xf000 +; GFX90A-NEXT: s_movk_i32 s3, 0x1000 +; GFX90A-NEXT: s_movk_i32 s4, 0x2000 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB1_2 Depth 2 -; GFX90A-NEXT: s_mov_b32 s3, 0 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1] +; GFX90A-NEXT: s_mov_b32 s5, 0 ; GFX90A-NEXT: .LBB1_2: ; %for.body ; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 -; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off -; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 -; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off -; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 -; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s1, v6 -; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[14:15], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[14:15], off offset:-2048 -; GFX90A-NEXT: 
global_load_dwordx2 v[28:29], v[14:15], off +; GFX90A-NEXT: v_add_co_u32_e64 v18, s[0:1], s3, v6 +; GFX90A-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v7, s[0:1] +; GFX90A-NEXT: v_add_co_u32_e64 v20, s[0:1], s4, v6 +; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffe000, v6 +; GFX90A-NEXT: v_addc_co_u32_e64 v21, s[0:1], 0, v7, s[0:1] +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[6:7], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[6:7], off +; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 +; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[20:21], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[20:21], off +; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[8:9], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[8:9], off ; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, s2, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048 -; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[22:23], off offset:-2048 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[18:19], off offset:2048 +; GFX90A-NEXT: s_nop 0 +; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[20:21], off offset:2048 ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX90A-NEXT: s_addk_i32 s3, 0x2000 -; GFX90A-NEXT: s_cmp_gt_u32 s3, 0x3fffff -; GFX90A-NEXT: s_waitcnt vmcnt(8) +; GFX90A-NEXT: s_addk_i32 s5, 0x2000 +; GFX90A-NEXT: s_cmp_gt_u32 s5, 0x3fffff +; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(7) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(6) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(5) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(4) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(2) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 
v5, vcc, v15, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX90A-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -822,7 +821,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2800, v0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX11-NEXT: ; =>This Loop Header: Depth=1 @@ -834,76 +833,76 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffc000 +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0xffffe000, v4 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0xffffc000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0xfffff000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, vcc_lo -; GFX11-NEXT: global_load_b64 v[14:15], v[8:9], off offset:-4096 -; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0xffffd000, v4 -; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v4, 0xffffe000 +; GFX11-NEXT: s_clause 0x6 +; GFX11-NEXT: global_load_b64 v[12:13], v[8:9], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off ; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:-2048 -; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, -1, v5, vcc_lo -; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:-2048 -; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, 0xffffe000, v4 +; GFX11-NEXT: global_load_b64 v[14:15], v[4:5], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[16:17], v[4:5], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[18:19], v[4:5], off +; GFX11-NEXT: global_load_b64 v[20:21], v[4:5], off offset:2048 +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v4, 0x2000 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, 0x1000, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo +; GFX11-NEXT: global_load_b64 v[26:27], v[22:23], off offset:-4096 +; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, 0x2000, v4 +; GFX11-NEXT: global_load_b64 v[24:25], v[24:25], off offset:2048 +; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v5, vcc_lo ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[20:21], v[16:17], off offset:-4096 -; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off -; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, -1, v5, vcc_lo -; GFX11-NEXT: 
v_add_co_u32 v22, vcc_lo, 0xfffff000, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, -1, v5, vcc_lo -; GFX11-NEXT: s_clause 0x5 -; GFX11-NEXT: global_load_b64 v[18:19], v[18:19], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[16:17], v[16:17], off -; GFX11-NEXT: global_load_b64 v[22:23], v[22:23], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[24:25], v[4:5], off offset:-4096 -; GFX11-NEXT: global_load_b64 v[26:27], v[4:5], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[28:29], v[4:5], off +; GFX11-NEXT: global_load_b64 v[22:23], v[22:23], off +; GFX11-NEXT: global_load_b64 v[28:29], v[28:29], off offset:2048 ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo ; GFX11-NEXT: s_addk_i32 s1, 0x2000 ; GFX11-NEXT: s_cmp_gt_u32 s1, 0x3fffff ; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v12, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(9) +; GFX11-NEXT: v_add_co_u32 v2, s0, v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, s0 +; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: v_add_co_u32 v2, s0, v10, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, s0 +; GFX11-NEXT: s_waitcnt vmcnt(7) +; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_add_co_u32 v2, s0, v8, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, s0 -; GFX11-NEXT: v_add_co_u32 v2, s0, v12, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, s0 -; GFX11-NEXT: v_add_co_u32 v2, s0, v20, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v21, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v17, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(5) ; GFX11-NEXT: v_add_co_u32 v2, s0, v18, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v19, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_add_co_u32 v2, s0, v16, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v20, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v17, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v21, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_add_co_u32 v2, s0, v22, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v26, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v23, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v27, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_add_co_u32 
v2, s0, v24, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v25, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_add_co_u32 v2, s0, v26, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v22, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v27, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v23, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v28, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index 5aafb0f576fb4..a5984799c74d4 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -147,7 +147,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_5]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.274, align 8, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) @@ -170,7 +170,7 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.283, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = 
S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4) @@ -191,24 +191,24 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.294, addrspace 4) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0 ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.257, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %470:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 ; CHECK-NEXT: KILL undef %470:sreg_64 ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 ; CHECK-NEXT: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.266, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_17:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_8]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.306, align 8, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: 
[[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] @@ -226,13 +226,13 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_23:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.324, addrspace 4) ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.330, addrspace 4) ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.336, addrspace 4) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll index f3cb5a783a08c..f6922c75ff848 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -7,36 +7,27 @@ define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) { ; GFX8-LABEL: barrier_vmcnt_global: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 
v1, 0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_barrier -; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: barrier_vmcnt_global: ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v1, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:4 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -57,22 +48,20 @@ bb: define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) { ; GFX8-LABEL: barrier_vscnt_global: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc -; GFX8-NEXT: flat_store_dword v[2:3], v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, 1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc +; GFX8-NEXT: flat_store_dword v[1:2], v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_barrier -; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: barrier_vscnt_global: @@ -81,18 +70,14 @@ define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) { ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: global_store_dword v[2:3], v1, off -; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v3, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: 
global_store_dword v[2:3], v0, off offset:-4 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -115,22 +100,19 @@ bb: define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) { ; GFX8-LABEL: barrier_vmcnt_vscnt_global: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc -; GFX8-NEXT: flat_store_dword v[2:3], v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -8, v1 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v2, vcc +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: flat_store_dword v[1:2], v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v3 @@ -142,19 +124,15 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) { ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc +; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:-8 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_store_dword v[2:3], v1, off -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: global_load_dword v3, v2, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: global_store_dword v[0:1], v3, off +; GFX9-NEXT: global_store_dword v[2:3], v0, off offset:-4 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -179,38 +157,30 @@ bb: define amdgpu_kernel void @barrier_vmcnt_flat(ptr %arg) { ; GFX8-LABEL: barrier_vmcnt_flat: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: 
v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_barrier -; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: barrier_vmcnt_flat: ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc -; GFX9-NEXT: flat_load_dword v4, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dword v2, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v4 +; GFX9-NEXT: flat_store_dword v[0:1], v2 offset:4 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -231,22 +201,20 @@ bb: define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) { ; GFX8-LABEL: barrier_vscnt_flat: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc -; GFX8-NEXT: flat_store_dword v[2:3], v1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, 1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc +; GFX8-NEXT: flat_store_dword v[1:2], v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, 1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_barrier -; GFX8-NEXT: flat_store_dword v[0:1], v3 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: barrier_vscnt_flat: @@ -255,18 +223,16 @@ define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) { ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -4, v2 ; GFX9-NEXT: flat_store_dword v[2:3], v1 -; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v3, 1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: flat_store_dword v[0:1], v2 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -289,22 
+255,19 @@ bb: define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) { ; GFX8-LABEL: barrier_vmcnt_vscnt_flat: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc -; GFX8-NEXT: flat_store_dword v[2:3], v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -8, v1 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v2, vcc +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: flat_store_dword v[1:2], v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v3 @@ -316,21 +279,18 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) { ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, -8, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc +; GFX9-NEXT: flat_load_dword v4, v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -4, v2 ; GFX9-NEXT: flat_store_dword v[2:3], v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: flat_load_dword v3, v[2:3] -; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: flat_store_dword v[0:1], v4 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -355,24 +315,20 @@ bb: define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { ; GFX8-LABEL: barrier_vmcnt_vscnt_flat_workgroup: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v4, s1 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc -; GFX8-NEXT: flat_store_dword v[2:3], v1 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 -; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 
0, v4, vcc -; GFX8-NEXT: flat_load_dword v3, v[2:3] -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc -; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc +; GFX8-NEXT: v_add_u32_e32 v3, vcc, -8, v1 +; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v2, vcc +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: flat_store_dword v[1:2], v0 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -383,22 +339,18 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc +; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, -8, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc +; GFX9-NEXT: flat_load_dword v4, v[4:5] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -4, v2 ; GFX9-NEXT: flat_store_dword v[2:3], v1 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc -; GFX9-NEXT: flat_load_dword v3, v[2:3] -; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc -; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v3 +; GFX9-NEXT: flat_store_dword v[0:1], v4 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -423,34 +375,25 @@ bb: define amdgpu_kernel void @load_vmcnt_global(ptr addrspace(1) %arg) { ; GFX8-LABEL: load_vmcnt_global: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: load_vmcnt_global: ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v2, v1, s[0:1] -; GFX9-NEXT: v_add_u32_e32 v1, 1, v0 -; GFX9-NEXT: v_mov_b32_e32 
v0, 0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1] -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v[0:1], v2, off +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:4 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -468,36 +411,28 @@ bb: define amdgpu_kernel void @load_vmcnt_flat(ptr %arg) { ; GFX8-LABEL: load_vmcnt_flat: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dword v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 -; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_store_dword v[0:1], v4 +; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: load_vmcnt_flat: ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v3, s1 -; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc -; GFX9-NEXT: flat_load_dword v4, v[1:2] -; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 -; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc +; GFX9-NEXT: flat_load_dword v2, v[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_dword v[0:1], v4 +; GFX9-NEXT: flat_store_dword v[0:1], v2 offset:4 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll index b3cbc3d049b04..9debcf10bb88b 100644 --- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll @@ -13,7 +13,7 @@ entry: %tmp = sext i32 undef to i64 %arrayidx114 = getelementptr inbounds %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp %tmp1 = getelementptr %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp, i32 0, i64 0, i64 1 -; CHECK: %tmp1 = getelementptr %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp, i32 0, i64 0, i64 1 +; CHECK: %tmp1 = getelementptr i8, ptr addrspace(1) %arrayidx114, i64 4 %tmp2 = load <4 x float>, ptr addrspace(1) undef, align 4 ret void } diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll index 03edfdceab325..8662e61c95d66 100644 --- 
a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll @@ -46,9 +46,9 @@ define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_off ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARR]], i64 [[TMP]] ; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(1) [[P1]], align 4 ; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[J2:%.*]] = add i32 [[J1]], [[I]] -; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[J2]] to i64 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARR]], i64 [[TMP5]] +; CHECK-NEXT: [[OFFSET:%.*]] = sext i32 [[I]] to i64 +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[OFFSET]], 2 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P1]], i64 [[TMP5]] ; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(1) [[P2]], align 4 ; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void @@ -109,8 +109,8 @@ define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(pt ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[ARR]], i32 [[J1]] ; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(3) [[P1]], align 4 ; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[J2:%.*]] = add i32 [[J1]], [[I]] -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[ARR]], i32 [[J2]] +; CHECK-NEXT: [[J2:%.*]] = shl i32 [[I]], 2 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[P1]], i32 [[J2]] ; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(3) [[P2]], align 4 ; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll new file mode 100644 index 0000000000000..dc649673c525f --- /dev/null +++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll @@ -0,0 +1,155 @@ +; RUN: opt < %s -passes=slsr -S | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +; CHECK-LABEL: slsr_i8_zero_delta( +; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]] +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32 +; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP0]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 64 +; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP1]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 96 +; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP2]] +define void @slsr_i8_zero_delta(ptr %in, ptr %out, i64 %add) { + %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add + %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32 + %load0 = load i8, ptr %getElem0.1 + + %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add + %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64 + %load1 = load i8, ptr %getElem1.1 + + %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add + %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96 + %load2 = load i8, ptr %getElem2.1 + + %out0 = add i8 %load0, %load1 + %out1 = add i8 %out0, %load2 + store i8 %out1, ptr %out + + ret void +} + +; CHECK-LABEL: slsr_i8_zero_delta_2( +; 
CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]] +; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP0]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[GEP0]], i64 32 +; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP1]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP0]], i64 64 +; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP2]] +define void @slsr_i8_zero_delta_2(ptr %in, ptr %out, i64 %add) { + %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add + %load0 = load i8, ptr %getElem0.0 + + %getElem1.0 = getelementptr i8, ptr %in, i64 %add + %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 32 + %load1 = load i8, ptr %getElem1.1 + + %getElem2.0 = getelementptr i8, ptr %in, i64 %add + %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 64 + %load2 = load i8, ptr %getElem2.1 + + %out0 = add i8 %load0, %load1 + %out1 = add i8 %out0, %load2 + store i8 %out1, ptr %out + + ret void +} + +; CHECK-LABEL: slsr_i8_base_delta( +; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]] +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32 +; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP0]] +; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 1 +; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 64 +; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP1_1]] +; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 2 +; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds i8, ptr [[GEP2_0]], i64 96 +; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP2_1]] +define void @slsr_i8_base_delta(ptr %in, ptr %out, i64 %add) { + %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add + %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32 + %load0 = load i8, ptr %getElem0.1 + + %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 1 + %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 %add + %getElem1.2 = getelementptr inbounds i8, ptr %getElem1.1, i64 64 + %load1 = load i8, ptr %getElem1.2 + + %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 2 + %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 %add + %getElem2.2 = getelementptr inbounds i8, ptr %getElem2.1, i64 96 + %load2 = load i8, ptr %getElem2.2 + + %out0 = add i8 %load0, %load1 + %out1 = add i8 %out0, %load2 + store i8 %out1, ptr %out + + ret void +} + +; CHECK-LABEL: slsr_i8_index_delta( +; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[ADD]] +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32 +; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP0]] +; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]] +; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 64 +; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP1_1]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 96 +; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP2]] +define void @slsr_i8_index_delta(ptr %in, ptr %out, i64 %add) { + %getElem0.0 = getelementptr inbounds double, ptr %in, i64 %add + %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32 + %load0 = load i8, ptr %getElem0.1 + + %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add + 
%getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64 + %load1 = load i8, ptr %getElem1.1 + + %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add + %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96 + %load2 = load i8, ptr %getElem2.1 + + %out0 = add i8 %load0, %load1 + %out1 = add i8 %out0, %load2 + store i8 %out1, ptr %out + + ret void +} + +; CHECK-LABEL: slsr_i8_stride_delta( +; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]], i64 [[OFFSET:%.*]]) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]] +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 32 +; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GEP0]] +; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 1 +; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds i8, ptr [[GEP1_0]], i64 64 +; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GEP1_1]] +; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds i8, ptr [[GEP]], i64 [[OFFSET]] +; CHECK-NEXT: [[GEP2_1:%.*]] = getelementptr inbounds i8, ptr [[GEP2_0]], i64 96 +; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GEP2_1]] +define void @slsr_i8_stride_delta(ptr %in, ptr %out, i64 %add, i64 %offset) { + %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add + %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32 + %load0 = load i8, ptr %getElem0.1 + + %add1 = add i64 %add, 1 + %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add1 + %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64 + %load1 = load i8, ptr %getElem1.1 + + %add2 = add i64 %add, %offset + %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add2 + %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96 + %load2 = load i8, ptr %getElem2.1 + + %out0 = add i8 %load0, %load1 + %out1 = add i8 %out0, %load2 + store i8 %out1, ptr %out + + ret void +} diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll new file mode 100644 index 0000000000000..b236640802e83 --- /dev/null +++ b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll @@ -0,0 +1,48 @@ +; RUN: opt < %s -passes=slsr -S | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX + +target triple = "nvptx64-nvidia-cuda" + +; Test SLSR can reuse the computation by complex variable delta. +; The original program needs 4 mul.wide.s32, after SLSR with +; variable-delta, it can reduce to 1 mul.wide.s32. 
+; PTX-COUNT-1: mul.wide.s32 +; PTX-NOT: mul.wide.s32 +define void @foo(ptr %a, ptr %b, i32 %j) { + %i.0 = load i32, ptr %a, align 8 + %i = add i32 %i.0, %j + ; CHECK: [[L:%.*]] = load i32, ptr %a, align 8 + ; CHECK: [[I:%.*]] = add i32 [[L]], %j + %gep.24 = getelementptr float, ptr %b, i32 %i + ; CHECK: [[GEP0:%.*]] = getelementptr float, ptr %b, i32 [[I]] + ; CHECK: store i32 0, ptr [[GEP0]] + store i32 0, ptr %gep.24 + %gep.24.sum1 = add i32 %i, %i + %gep.25 = getelementptr float, ptr %b, i32 %gep.24.sum1 + ; CHECK: [[EXT1:%.*]] = sext i32 [[I]] to i64 + ; CHECK: [[MUL1:%.*]] = shl i64 [[EXT1]], 2 + ; CHECK: [[GEP1:%.*]] = getelementptr i8, ptr [[GEP0]], i64 [[MUL1]] + ; CHECK: store i32 1, ptr [[GEP1]] + store i32 1, ptr %gep.25 + %gep.26.sum3 = add i32 1, %i + %gep.27.sum = add i32 %gep.26.sum3, %i + %gep.28 = getelementptr float, ptr %b, i32 %gep.27.sum + ; CHECK: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 4 + ; CHECK: store i32 2, ptr [[GEP2]] + store i32 2, ptr %gep.28 + %gep.28.sum = add i32 %gep.27.sum, %i + %gep.29 = getelementptr float, ptr %b, i32 %gep.28.sum + ; CHECK: [[EXT2:%.*]] = sext i32 [[I]] to i64 + ; CHECK: [[MUL2:%.*]] = shl i64 [[EXT2]], 2 + ; CHECK: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[MUL2]] + ; CHECK: store i32 3, ptr [[GEP3]] + store i32 3, ptr %gep.29 + %gep.29.sum = add i32 %gep.28.sum, %i + %gep.30 = getelementptr float, ptr %b, i32 %gep.29.sum + ; CHECK: [[EXT3:%.*]] = sext i32 [[I]] to i64 + ; CHECK: [[MUL3:%.*]] = shl i64 [[EXT3]], 2 + ; CHECK: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP3]], i64 [[MUL3]] + ; CHECK: store i32 4, ptr [[GEP4]] + store i32 4, ptr %gep.30 + ret void +} diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll b/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll new file mode 100644 index 0000000000000..4e8ae40f039aa --- /dev/null +++ b/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll @@ -0,0 +1,35 @@ +; RUN: opt < %s -passes="slsr" -S | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" + +%struct.B = type { i16 } +%struct.A = type { %struct.B, %struct.B, %struct.B } + +define void @path_compression(i32 %a, ptr %base, i16 %r, i1 %cond) { +; CHECK-LABEL: @path_compression( +; CHECK: [[I:%.*]] = sext i32 %a to i64 +; CHECK: [[GEP1:%.*]] = getelementptr inbounds %struct.A, ptr %base, i64 [[I]] +; CHECK: br +; CHECK-LABEL: next +; compress the path to use GEP1 as the Basis instead of GEP2 +; CHECK: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 2 +; CHECK: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 4 + + + %1 = sext i32 %a to i64 + %2 = add i64 %1, 1 + %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1 + br i1 %cond, label %next, label %ret + +next: + %getElem2 = getelementptr inbounds %struct.A, ptr %base, i64 %1, i32 1 + %offset = sub i64 %2, 1 + %getElem3 = getelementptr inbounds %struct.A, ptr %base, i64 %offset, i32 2 + store i16 %r, ptr %getElem1, align 2 + store i16 %r, ptr %getElem2, align 2 + store i16 %r, ptr %getElem3, align 2 + br label %ret + +ret: + ret void +} diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll b/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll new file mode 100644 index 0000000000000..7fa0e4f9474f1 --- /dev/null +++ b/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll @@ -0,0 +1,32 @@ +; RUN: opt < %s -passes="slsr" -S | FileCheck %s + +target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" + 
+%struct.B = type { i16 } +%struct.A = type { %struct.B, %struct.B } + +define i32 @pick(i32 %0, ptr %addr) { +; `d` can be optimized by 2 approaches +; 1. a = 1 + 1 * %0 +; d = 1 + 8 * %0 +; = a + 7 * %0 +; 2. c = (8 * %0) + 3 +; d = (8 * %0) + 1 +; = c - 2 +; Pick candidate (2) as it can save 1 instruction from (7 * %0) +; +; CHECK-LABEL: pick +; CHECK: [[A:%.*]] = add i32 %0, 1 +; CHECK: [[B:%.*]] = shl i32 %0, 3 +; CHECK: [[C:%.*]] = add i32 [[B]], 3 +; CHECK: store i32 [[C]], ptr %addr +; CHECK: [[D:%.*]] = add i32 [[C]], -2 +; CHECK: ret i32 %d + + %a = add i32 %0, 1 + %b = shl i32 %0, 3 + %c = add i32 %b, 3 + store i32 %c, ptr %addr + %d = add i32 %b, 1 + ret i32 %d +} diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll index d85331f77b12f..5d23735237f9f 100644 --- a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll @@ -4,6 +4,8 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" +; Index Delta + define void @shl(i32 %b, i32 %s) { ; CHECK-LABEL: @shl( ; CHECK-NEXT: [[T1:%.*]] = add i32 [[B:%.*]], [[S:%.*]] @@ -171,3 +173,97 @@ define void @slsr_strided_add_128bit(i128 %b, i128 %s) { declare void @foo(i32) declare void @voo(<2 x i32>) declare void @bar(i128) + +; Stride Delta + +define void @stride_const(i32 %a, ptr %base, i16 %r) { +; Reuse add1 to compute add2 +; CHECK-LABEL: @stride_const( +; CHECK: [[I1:%.*]] = sext i32 %a to i64 +; CHECK: [[I2:%.*]] = mul i64 [[I1]], 2 +; CHECK: [[BI:%.*]] = ptrtoint ptr %base to i64 +; CHECK: [[ADD1:%.*]] = add i64 [[BI]], [[I2]] +; CHECK: [[ADD2:%.*]] = add i64 [[ADD1]], 8 + %1 = sext i32 %a to i64 + %2 = mul i64 %1, 2 + %3 = add i64 %1, 4 + %4 = mul i64 %3, 2 + %baseInt = ptrtoint ptr %base to i64 + %add1 = add i64 %baseInt, %2 + %add2 = add i64 %baseInt, %4 + %addr1 = inttoptr i64 %add1 to ptr + %addr2 = inttoptr i64 %add2 to ptr + store i16 %r, ptr %addr1, align 2 + store i16 %r, ptr %addr2, align 2 + ret void +} + +define void @stride_var(i32 %a, ptr %base, i16 %r, i64 %n) { +; Reuse add1 to compute add2 to save a add.s64 +; CHECK-LABEL: @stride_var( +; CHECK: [[I1:%.*]] = sext i32 %a to i64 +; CHECK: [[I2:%.*]] = mul i64 [[I1]], 2 +; CHECK: [[BI:%.*]] = ptrtoint ptr %base to i64 +; CHECK: [[ADD1:%.*]] = add i64 [[BI]], [[I2]] +; CHECK: [[I3:%.*]] = shl i64 %n, 1 +; CHECK: [[ADD2:%.*]] = add i64 [[ADD1]], [[I3]] + + %1 = sext i32 %a to i64 + %2 = mul i64 %1, 2 + %3 = add i64 %1, %n + %4 = mul i64 %3, 2 + %baseInt = ptrtoint ptr %base to i64 + %add1 = add i64 %baseInt, %2 + %add2 = add i64 %baseInt, %4 + %addr1 = inttoptr i64 %add1 to ptr + %addr2 = inttoptr i64 %add2 to ptr + store i16 %r, ptr %addr1, align 2 + store i16 %r, ptr %addr2, align 2 + ret void +} + +; Base Delta + +define void @base_const(i32 %a, ptr %base, i16 %r) { +; Reuse add1 to compute add2 +; CHECK-LABEL: @base_const( +; CHECK: [[I1:%.*]] = sext i32 %a to i64 +; CHECK: [[I2:%.*]] = mul i64 [[I1]], 2 +; CHECK: [[BI:%.*]] = ptrtoint ptr %base to i64 +; CHECK: [[ADD1:%.*]] = add i64 [[BI]], [[I2]] +; CHECK: [[ADD2:%.*]] = add i64 [[ADD1]], 5 + + %1 = sext i32 %a to i64 + %2 = mul i64 %1, 2 + %baseInt = ptrtoint ptr %base to i64 + %add1 = add i64 %baseInt, %2 + %add2.0 = add i64 %baseInt, 5 + %add2 = add i64 %add2.0, %2 + %addr1 = inttoptr i64 %add1 to ptr + %addr2 = inttoptr i64 %add2 to ptr + store i16 %r, ptr %addr1, align 2 + store i16 %r, ptr %addr2, align 2 + ret void +} + +define void @base_var(i32 %a, ptr 
%base, i16 %r, i64 %n) { +; Reuse add1 to compute add2 +; CHECK-LABEL: @base_var( +; CHECK: [[I1:%.*]] = sext i32 %a to i64 +; CHECK: [[I2:%.*]] = mul i64 [[I1]], 2 +; CHECK: [[BI:%.*]] = ptrtoint ptr %base to i64 +; CHECK: [[ADD1:%.*]] = add i64 [[BI]], [[I2]] +; CHECK: [[ADD2:%.*]] = add i64 [[ADD1]], %n + + %1 = sext i32 %a to i64 + %2 = mul i64 %1, 2 + %baseInt = ptrtoint ptr %base to i64 + %add1 = add i64 %baseInt, %2 + %add2.0 = add i64 %baseInt, %n + %add2 = add i64 %add2.0, %2 + %addr1 = inttoptr i64 %add1 to ptr + %addr2 = inttoptr i64 %add2 to ptr + store i16 %r, ptr %addr1, align 2 + store i16 %r, ptr %addr2, align 2 + ret void +} diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll index 7cd45329c24fe..5eb8d058e295f 100644 --- a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll @@ -3,6 +3,42 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64-p:64:64:64-p1:32:32:32-p2:128:128:128:32" +; Index Delta +; Most of the original test cases in this file were optimized by Index-delta. +; After adding Base-delta and Stride-delta, most of the GEP test cases +; are optimized by Stride-delta now. The only case that GEP needs index-delta +; SLSR is to reuse address computation from a GEP with different pointee type. +; Once LLVM completely moves from typed GEP to PtrAdd, we can remove +; index-delta for GEP/PtrAdd. + +define void @index_delta(ptr %input, i32 %c, i32 %b, i32 %n, float %r) { +; CHECK-LABEL: define void @index_delta( +; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) { +; CHECK-NEXT: [[ADD0:%.*]] = add nsw i32 [[B]], 1 +; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[ADD0]], [[N]] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[MUL_1]], [[C]] +; CHECK-NEXT: [[OFFSET:%.*]] = sext i32 [[ADD1]] to i64 +; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr i8, ptr [[INPUT]], i64 [[OFFSET]] +; CHECK-NEXT: store float [[R]], ptr [[GETELEM]], align 4 +; CHECK-NEXT: [[TMP:%.*]] = mul i64 [[OFFSET]], 3 +; CHECK-NEXT: [[GETELEM_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM]], i64 [[TMP]] +; CHECK-NEXT: store float [[R]], ptr [[GETELEM_1]], align 4 +; CHECK-NEXT: ret void + + %add0 = add nsw i32 %b, 1 + %mul.1 = mul nsw i32 %add0, %n + %add.1 = add i32 %mul.1, %c + %offset = sext i32 %add.1 to i64 + %getElem = getelementptr i8, ptr %input, i64 %offset + store float %r, ptr %getElem, align 4 + %getElem.1 = getelementptr inbounds float, ptr %input, i64 %offset + store float %r, ptr %getElem.1, align 4 + + ret void +} + +; Stride Delta + ; foo(input[0]); ; foo(input[s]); ; foo(input[s * 2]); @@ -263,3 +299,115 @@ declare void @foo(ptr) declare void @bar(ptr) declare void @baz(ptr addrspace(1)) declare void @baz2(ptr addrspace(2)) + +define void @stride_const(ptr %input, i32 %c, i32 %b, i32 %n, float %r) { +; CHECK-LABEL: define void @stride_const( +; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) { +; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B]], [[N]] +; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[C]] +; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD]], [[N]] +; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[N]] +; CHECK-NEXT: [[OFFSET:%.*]] = sext i32 [[ADD_2]] to i64 +; CHECK-NEXT: [[GETELEM_1:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[OFFSET]] +; CHECK-NEXT: store float [[R]], ptr [[GETELEM_1]], align 4 +; CHECK-NEXT: [[GETELEM_2:%.*]] = 
getelementptr i8, ptr [[GETELEM_1]], i64 16 +; CHECK-NEXT: store float [[R]], ptr [[GETELEM_2]], align 4 +; CHECK-NEXT: ret void +; + + %mul = mul nsw i32 %b, %n + %add = add i32 %mul, %c + %add.1 = add i32 %add, %n + %add.2 = add i32 %add.1, %n + + %offset = sext i32 %add.2 to i64 + %1 = getelementptr float, ptr %input, i64 %offset + store float %r, ptr %1, align 4 + + %offset3 = add i64 %offset, 4 + %2 = getelementptr float, ptr %input, i64 %offset3 + store float %r, ptr %2, align 4 + ret void +} + + +define void @stride_var(ptr %input, i32 %c, i32 %b, i32 %n, float %r) { +; CHECK-LABEL: define void @stride_var( +; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) { +; CHECK-NEXT: [[ADD0:%.*]] = add nsw i32 [[B]], 1 +; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[ADD0]], [[N]] +; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[MUL_1]], [[C]] +; CHECK-NEXT: [[I:%.*]] = sext i32 [[ADD1]] to i64 +; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[I]] +; CHECK-NEXT: store float [[R]], ptr [[GETELEM]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[N]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 2 +; CHECK-NEXT: [[GETELEM_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM]], i64 [[TMP2]] +; CHECK-NEXT: store float [[R]], ptr [[GETELEM_1]], align 4 +; CHECK-NEXT: ret void +; +; Reuse getElem to compute getElem.1 and getElem.2 with variable offset n extracted from Stride + + %add0 = add nsw i32 %b, 1 + %mul.1 = mul nsw i32 %add0, %n + %add.1 = add i32 %mul.1, %c + %offset = sext i32 %add.1 to i64 + %getElem = getelementptr float, ptr %input, i64 %offset + store float %r, ptr %getElem, align 4 + + %mul = mul nsw i32 %b, %n + %add = add i32 %mul, %c + %add.11 = add i32 %add, %n + %add.2 = add i32 %add.11, %n + %offset1 = sext i32 %add.2 to i64 + %getElem.1 = getelementptr inbounds float, ptr %input, i64 %offset1 + store float %r, ptr %getElem.1, align 4 + + ret void +} + +; Base Delta + +%struct.B = type { i16 } +%struct.A = type { %struct.B, %struct.B } + +define void @base_const(i32 %a, ptr %base, i16 %r) { +; Reuse getElem1 to compute getElem2 +; CHECK-LABEL: define void @base_const( +; CHECK-SAME: i32 [[A:%.*]], ptr [[BASE:%.*]], i16 [[R:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[A]] to i64 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], ptr [[BASE]], i64 [[TMP1]] +; CHECK-NEXT: store i16 [[R]], ptr [[GEP1]], align 2 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 2 +; CHECK-NEXT: store i16 [[R]], ptr [[GEP2]], align 2 +; CHECK-NEXT: ret void +; + + %1 = sext i32 %a to i64 + %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1 + store i16 %r, ptr %getElem1, align 2 + %getElem2 = getelementptr inbounds %struct.A, ptr %base, i64 %1, i32 1 + store i16 %r, ptr %getElem2, align 2 + ret void +} + +define void @base_var(i32 %a, ptr %base, i16 %r, i64 %n) { +; Reuse getElem1 to compute getElem2 +; CHECK-LABEL: define void @base_var( +; CHECK-SAME: i32 [[A:%.*]], ptr [[BASE:%.*]], i16 [[R:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[A]] to i64 +; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], ptr [[BASE]], i64 [[TMP1]] +; CHECK-NEXT: store i16 [[R]], ptr [[GETELEM1]], align 2 +; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1]], i64 [[N]] +; CHECK-NEXT: store i16 [[R]], ptr [[GETELEM2]], align 2 +; CHECK-NEXT: ret void +; + + %1 = sext i32 %a to i64 + %base1 = getelementptr inbounds i8, ptr %base, i64 
%n + %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1 + store i16 %r, ptr %getElem1, align 2 + %getElem2 = getelementptr inbounds %struct.A, ptr %base1, i64 %1 + store i16 %r, ptr %getElem2, align 2 + ret void +}
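
; Reviewer note (not part of the patch): the IR below is a minimal, hypothetical
; sketch of the stride-delta rewrite that the new GEP tests above (e.g.
; stride_var in slsr-gep.ll) check for. The function @example and its value
; names are made up for illustration; only the rewrite shape is taken from the
; tests. When two GEP candidates share Base and Index but their strides differ
; by %n, SLSR rebases the second GEP on the first and materializes only the
; scaled delta.

define void @example(ptr %p, i64 %s, i64 %n, float %r) {
  ; Two accesses whose strides differ by %n.
  %gep1 = getelementptr inbounds float, ptr %p, i64 %s
  store float %r, ptr %gep1, align 4
  %s.n  = add i64 %s, %n
  %gep2 = getelementptr inbounds float, ptr %p, i64 %s.n
  store float %r, ptr %gep2, align 4
  ret void
}

; Expected shape after -passes=slsr (stride delta): %gep2 is expressed as an
; i8 GEP off %gep1, so only the element-scaled delta is computed.
;   %delta = shl i64 %n, 2
;   %gep2  = getelementptr inbounds i8, ptr %gep1, i64 %delta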