From 8bd8b5bc6153b9e674d745ea721c0f103bfcefd8 Mon Sep 17 00:00:00 2001 From: Alan Li Date: Tue, 25 Nov 2025 18:30:56 +0000 Subject: [PATCH 1/3] Revert "Redesign Straight-Line Strength Reduction (SLSR) (#162930)" This reverts commit f67409c3ec7cd45c55656c8159bc42b3918f1116. --- .../Scalar/StraightLineStrengthReduce.cpp | 1140 ++++------------- .../AMDGPU/agpr-copy-no-free-registers.ll | 20 +- .../AMDGPU/dagcombine-reassociate-bug.ll | 2 +- llvm/test/CodeGen/AMDGPU/idot2.ll | 12 +- llvm/test/CodeGen/AMDGPU/idot4s.ll | 161 ++- llvm/test/CodeGen/AMDGPU/idot8u.ll | 6 +- .../AMDGPU/promote-constOffset-to-imm.ll | 468 +++---- .../AMDGPU/splitkit-getsubrangeformask.ll | 50 +- llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll | 329 +++-- .../AMDGPU/pr23975.ll | 2 +- .../reassociate-geps-and-slsr-addrspace.ll | 10 +- .../NVPTX/slsr-i8-gep.ll | 271 ---- .../NVPTX/slsr-var-delta.ll | 70 - .../path-compression.ll | 35 - .../pick-candidate.ll | 32 - .../StraightLineStrengthReduce/slsr-add.ll | 120 -- .../StraightLineStrengthReduce/slsr-gep.ll | 149 --- 17 files changed, 839 insertions(+), 2038 deletions(-) delete mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll delete mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll delete mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll delete mode 100644 llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll diff --git a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp index e5399bdd767e2..e94ad1999e32a 100644 --- a/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/StraightLineStrengthReduce.cpp @@ -12,16 +12,17 @@ // effective in simplifying arithmetic statements derived from an unrolled loop. // It can also simplify the logic of SeparateConstOffsetFromGEP. // -// There are many optimizations we can perform in the domain of SLSR. -// We look for strength reduction candidates in the following forms: +// There are many optimizations we can perform in the domain of SLSR. This file +// for now contains only an initial step. Specifically, we look for strength +// reduction candidates in the following forms: // -// Form Add: B + i * S -// Form Mul: (B + i) * S -// Form GEP: &B[i * S] +// Form 1: B + i * S +// Form 2: (B + i) * S +// Form 3: &B[i * S] // // where S is an integer variable, and i is a constant integer. If we found two // candidates S1 and S2 in the same form and S1 dominates S2, we may rewrite S2 -// in a simpler way with respect to S1 (index delta). For example, +// in a simpler way with respect to S1. For example, // // S1: X = B + i * S // S2: Y = B + i' * S => X + (i' - i) * S @@ -34,29 +35,8 @@ // // Note: (i' - i) * S is folded to the extent possible. // -// For Add and GEP forms, we can also rewrite a candidate in a simpler way -// with respect to other dominating candidates if their B or S are different -// but other parts are the same. For example, -// -// Base Delta: -// S1: X = B + i * S -// S2: Y = B' + i * S => X + (B' - B) -// -// S1: X = &B [i * S] -// S2: Y = &B'[i * S] => X + (B' - B) -// -// Stride Delta: -// S1: X = B + i * S -// S2: Y = B + i * S' => X + i * (S' - S) -// -// S1: X = &B[i * S] -// S2: Y = &B[i * S'] => X + i * (S' - S) -// -// PS: Stride delta rewrite on Mul form is usually non-profitable, and Base -// delta rewrite sometimes is profitable, so we do not support them on Mul. 
-// // This rewriting is in general a good idea. The code patterns we focus on -// usually come from loop unrolling, so the delta is likely the same +// usually come from loop unrolling, so (i' - i) * S is likely the same // across iterations and can be reused. When that happens, the optimized form // takes only one add starting from the second iteration. // @@ -67,14 +47,19 @@ // TODO: // // - Floating point arithmetics when fast math is enabled. +// +// - SLSR may decrease ILP at the architecture level. Targets that are very +// sensitive to ILP may want to disable it. Having SLSR to consider ILP is +// left as future work. +// +// - When (i' - i) is constant but i and i' are not, we could still perform +// SLSR. #include "llvm/Transforms/Scalar/StraightLineStrengthReduce.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/ScalarEvolution.h" -#include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" #include "llvm/IR/Constants.h" @@ -101,19 +86,16 @@ #include #include #include -#include #include using namespace llvm; using namespace PatternMatch; -#define DEBUG_TYPE "slsr" - static const unsigned UnknownAddressSpace = std::numeric_limits::max(); DEBUG_COUNTER(StraightLineStrengthReduceCounter, "slsr-counter", - "Controls whether rewriteCandidate is executed."); + "Controls whether rewriteCandidateWithBasis is executed."); namespace { @@ -160,23 +142,15 @@ class StraightLineStrengthReduce { GEP, // &B[..][i * S][..] }; - enum DKind { - InvalidDelta, // reserved for the default constructor - IndexDelta, // Delta is a constant from Index - BaseDelta, // Delta is a constant or variable from Base - StrideDelta, // Delta is a constant or variable from Stride - }; - Candidate() = default; Candidate(Kind CT, const SCEV *B, ConstantInt *Idx, Value *S, - Instruction *I, const SCEV *StrideSCEV) - : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I), - StrideSCEV(StrideSCEV) {} + Instruction *I) + : CandidateKind(CT), Base(B), Index(Idx), Stride(S), Ins(I) {} Kind CandidateKind = Invalid; const SCEV *Base = nullptr; - // TODO: Swap Index and Stride's name. + // Note that Index and Stride of a GEP candidate do not necessarily have the // same integer type. In that case, during rewriting, Stride will be // sign-extended or truncated to Index's type. @@ -203,164 +177,22 @@ class StraightLineStrengthReduce { // Points to the immediate basis of this candidate, or nullptr if we cannot // find any basis for this candidate. Candidate *Basis = nullptr; - - DKind DeltaKind = InvalidDelta; - - // Store SCEV of Stride to compute delta from different strides - const SCEV *StrideSCEV = nullptr; - - // Points to (Y - X) that will be used to rewrite this candidate. - Value *Delta = nullptr; - - /// Cost model: Evaluate the computational efficiency of the candidate. 
- /// - /// Efficiency levels (higher is better): - /// ZeroInst (5) - [Variable] or [Const] - /// OneInstOneVar (4) - [Variable + Const] or [Variable * Const] - /// OneInstTwoVar (3) - [Variable + Variable] or [Variable * Variable] - /// TwoInstOneVar (2) - [Const + Const * Variable] - /// TwoInstTwoVar (1) - [Variable + Const * Variable] - enum EfficiencyLevel : unsigned { - Unknown = 0, - TwoInstTwoVar = 1, - TwoInstOneVar = 2, - OneInstTwoVar = 3, - OneInstOneVar = 4, - ZeroInst = 5 - }; - - static EfficiencyLevel - getComputationEfficiency(Kind CandidateKind, const ConstantInt *Index, - const Value *Stride, const SCEV *Base = nullptr) { - bool IsConstantBase = false; - bool IsZeroBase = false; - // When evaluating the efficiency of a rewrite, if the Base's SCEV is - // not available, conservatively assume the base is not constant. - if (auto *ConstBase = dyn_cast_or_null(Base)) { - IsConstantBase = true; - IsZeroBase = ConstBase->getValue()->isZero(); - } - - bool IsConstantStride = isa(Stride); - bool IsZeroStride = - IsConstantStride && cast(Stride)->isZero(); - // All constants - if (IsConstantBase && IsConstantStride) - return ZeroInst; - - // (Base + Index) * Stride - if (CandidateKind == Mul) { - if (IsZeroStride) - return ZeroInst; - if (Index->isZero()) - return (IsConstantStride || IsConstantBase) ? OneInstOneVar - : OneInstTwoVar; - - if (IsConstantBase) - return IsZeroBase && (Index->isOne() || Index->isMinusOne()) - ? ZeroInst - : OneInstOneVar; - - if (IsConstantStride) { - auto *CI = cast(Stride); - return (CI->isOne() || CI->isMinusOne()) ? OneInstOneVar - : TwoInstOneVar; - } - return TwoInstTwoVar; - } - - // Base + Index * Stride - assert(CandidateKind == Add || CandidateKind == GEP); - if (Index->isZero() || IsZeroStride) - return ZeroInst; - - bool IsSimpleIndex = Index->isOne() || Index->isMinusOne(); - - if (IsConstantBase) - return IsZeroBase ? (IsSimpleIndex ? ZeroInst : OneInstOneVar) - : (IsSimpleIndex ? OneInstOneVar : TwoInstOneVar); - - if (IsConstantStride) - return IsZeroStride ? ZeroInst : OneInstOneVar; - - if (IsSimpleIndex) - return OneInstTwoVar; - - return TwoInstTwoVar; - } - - // Evaluate if the given delta is profitable to rewrite this candidate. - bool isProfitableRewrite(const Value *Delta, const DKind DeltaKind) const { - // This function cannot accurately evaluate the profit of whole expression - // with context. A candidate (B + I * S) cannot express whether this - // instruction needs to compute on its own (I * S), which may be shared - // with other candidates or may need instructions to compute. - // If the rewritten form has the same strength, still rewrite to - // (X + Delta) since it may expose more CSE opportunities on Delta, as - // unrolled loops usually have identical Delta for each unrolled body. - // - // Note, this function should only be used on Index Delta rewrite. - // Base and Stride delta need context info to evaluate the register - // pressure impact from variable delta. - return getComputationEfficiency(CandidateKind, Index, Stride, Base) <= - getRewriteEfficiency(Delta, DeltaKind); - } - - // Evaluate the rewrite efficiency of this candidate with its Basis - EfficiencyLevel getRewriteEfficiency() const { - return Basis ? 
getRewriteEfficiency(Delta, DeltaKind) : Unknown; - } - - // Evaluate the rewrite efficiency of this candidate with a given delta - EfficiencyLevel getRewriteEfficiency(const Value *Delta, - const DKind DeltaKind) const { - switch (DeltaKind) { - case BaseDelta: // [X + Delta] - return getComputationEfficiency( - CandidateKind, - ConstantInt::get(cast(Delta->getType()), 1), Delta); - case StrideDelta: // [X + Index * Delta] - return getComputationEfficiency(CandidateKind, Index, Delta); - case IndexDelta: // [X + Delta * Stride] - return getComputationEfficiency(CandidateKind, cast(Delta), - Stride); - default: - return Unknown; - } - } - - bool isHighEfficiency() const { - return getComputationEfficiency(CandidateKind, Index, Stride, Base) >= - OneInstOneVar; - } - - // Verify that this candidate has valid delta components relative to the - // basis - bool hasValidDelta(const Candidate &Basis) const { - switch (DeltaKind) { - case IndexDelta: - // Index differs, Base and Stride must match - return Base == Basis.Base && StrideSCEV == Basis.StrideSCEV; - case StrideDelta: - // Stride differs, Base and Index must match - return Base == Basis.Base && Index == Basis.Index; - case BaseDelta: - // Base differs, Stride and Index must match - return StrideSCEV == Basis.StrideSCEV && Index == Basis.Index; - default: - return false; - } - } }; bool runOnFunction(Function &F); private: - // Fetch straight-line basis for rewriting C, update C.Basis to point to it, - // and store the delta between C and its Basis in C.Delta. - void setBasisAndDeltaFor(Candidate &C); + // Returns true if Basis is a basis for C, i.e., Basis dominates C and they + // share the same base and stride. + bool isBasisFor(const Candidate &Basis, const Candidate &C); + // Returns whether the candidate can be folded into an addressing mode. - bool isFoldable(const Candidate &C, TargetTransformInfo *TTI); + bool isFoldable(const Candidate &C, TargetTransformInfo *TTI, + const DataLayout *DL); + + // Returns true if C is already in a simplest form and not worth being + // rewritten. + bool isSimplestForm(const Candidate &C); // Checks whether I is in a candidate form. If so, adds all the matching forms // to Candidates, and tries to find the immediate basis for each of them. @@ -384,6 +216,12 @@ class StraightLineStrengthReduce { // Allocate candidates and find bases for GetElementPtr instructions. void allocateCandidatesAndFindBasisForGEP(GetElementPtrInst *GEP); + // A helper function that scales Idx with ElementSize before invoking + // allocateCandidatesAndFindBasis. + void allocateCandidatesAndFindBasisForGEP(const SCEV *B, ConstantInt *Idx, + Value *S, uint64_t ElementSize, + Instruction *I); + // Adds the given form to Candidates, and finds its immediate // basis. void allocateCandidatesAndFindBasis(Candidate::Kind CT, const SCEV *B, @@ -391,7 +229,13 @@ class StraightLineStrengthReduce { Instruction *I); // Rewrites candidate C with respect to Basis. - void rewriteCandidate(const Candidate &C); + void rewriteCandidateWithBasis(const Candidate &C, const Candidate &Basis); + + // A helper function that factors ArrayIdx to a product of a stride and a + // constant index, and invokes allocateCandidatesAndFindBasis with the + // factorings. + void factorArrayIndex(Value *ArrayIdx, const SCEV *Base, uint64_t ElementSize, + GetElementPtrInst *GEP); // Emit code that computes the "bump" from Basis to C. 
static Value *emitBump(const Candidate &Basis, const Candidate &C, @@ -403,203 +247,12 @@ class StraightLineStrengthReduce { TargetTransformInfo *TTI = nullptr; std::list Candidates; - // Map from SCEV to instructions that represent the value, - // instructions are sorted in depth-first order. - DenseMap> SCEVToInsts; - - // Record the dependency between instructions. If C.Basis == B, we would have - // {B.Ins -> {C.Ins, ...}}. - MapVector> DependencyGraph; - - // Map between each instruction and its possible candidates. - DenseMap> RewriteCandidates; - - // All instructions that have candidates sort in topological order based on - // dependency graph, from roots to leaves. - std::vector SortedCandidateInsts; - - // Record all instructions that are already rewritten and will be removed - // later. - std::vector DeadInstructions; - - // Classify candidates against Delta kind - class CandidateDictTy { - public: - using CandsTy = SmallVector; - using BBToCandsTy = DenseMap; - - private: - // Index delta Basis must have the same (Base, StrideSCEV, Inst.Type) - using IndexDeltaKeyTy = std::tuple; - DenseMap IndexDeltaCandidates; - - // Base delta Basis must have the same (StrideSCEV, Index, Inst.Type) - using BaseDeltaKeyTy = std::tuple; - DenseMap BaseDeltaCandidates; - - // Stride delta Basis must have the same (Base, Index, Inst.Type) - using StrideDeltaKeyTy = std::tuple; - DenseMap StrideDeltaCandidates; - - public: - // TODO: Disable index delta on GEP after we completely move - // from typed GEP to PtrAdd. - const BBToCandsTy *getCandidatesWithDeltaKind(const Candidate &C, - Candidate::DKind K) const { - assert(K != Candidate::InvalidDelta); - if (K == Candidate::IndexDelta) { - IndexDeltaKeyTy IndexDeltaKey(C.Base, C.StrideSCEV, C.Ins->getType()); - auto It = IndexDeltaCandidates.find(IndexDeltaKey); - if (It != IndexDeltaCandidates.end()) - return &It->second; - } else if (K == Candidate::BaseDelta) { - BaseDeltaKeyTy BaseDeltaKey(C.StrideSCEV, C.Index, C.Ins->getType()); - auto It = BaseDeltaCandidates.find(BaseDeltaKey); - if (It != BaseDeltaCandidates.end()) - return &It->second; - } else { - assert(K == Candidate::StrideDelta); - StrideDeltaKeyTy StrideDeltaKey(C.Base, C.Index, C.Ins->getType()); - auto It = StrideDeltaCandidates.find(StrideDeltaKey); - if (It != StrideDeltaCandidates.end()) - return &It->second; - } - return nullptr; - } - - // Pointers to C must remain valid until CandidateDict is cleared. - void add(Candidate &C) { - Type *ValueType = C.Ins->getType(); - BasicBlock *BB = C.Ins->getParent(); - IndexDeltaKeyTy IndexDeltaKey(C.Base, C.StrideSCEV, ValueType); - BaseDeltaKeyTy BaseDeltaKey(C.StrideSCEV, C.Index, ValueType); - StrideDeltaKeyTy StrideDeltaKey(C.Base, C.Index, ValueType); - IndexDeltaCandidates[IndexDeltaKey][BB].push_back(&C); - BaseDeltaCandidates[BaseDeltaKey][BB].push_back(&C); - StrideDeltaCandidates[StrideDeltaKey][BB].push_back(&C); - } - // Remove all mappings from set - void clear() { - IndexDeltaCandidates.clear(); - BaseDeltaCandidates.clear(); - StrideDeltaCandidates.clear(); - } - } CandidateDict; - - const SCEV *getAndRecordSCEV(Value *V) { - auto *S = SE->getSCEV(V); - if (isa(V) && !(isa(S) || - isa(S) || isa(S))) - SCEVToInsts[S].insert(cast(V)); - - return S; - } - - // Get the nearest instruction before CI that represents the value of S, - // return nullptr if no instruction is associated with S or S is not a - // reusable expression. 
- Value *getNearestValueOfSCEV(const SCEV *S, const Instruction *CI) const { - if (isa(S)) - return nullptr; - - if (auto *SU = dyn_cast(S)) - return SU->getValue(); - if (auto *SC = dyn_cast(S)) - return SC->getValue(); - - auto It = SCEVToInsts.find(S); - if (It == SCEVToInsts.end()) - return nullptr; - - // Instructions are sorted in depth-first order, so search for the nearest - // instruction by walking the list in reverse order. - for (Instruction *I : reverse(It->second)) - if (DT->dominates(I, CI)) - return I; - - return nullptr; - } - - struct DeltaInfo { - Candidate *Cand; - Candidate::DKind DeltaKind; - Value *Delta; - - DeltaInfo() - : Cand(nullptr), DeltaKind(Candidate::InvalidDelta), Delta(nullptr) {} - DeltaInfo(Candidate *Cand, Candidate::DKind DeltaKind, Value *Delta) - : Cand(Cand), DeltaKind(DeltaKind), Delta(Delta) {} - operator bool() const { return Cand != nullptr; } - }; - - friend raw_ostream &operator<<(raw_ostream &OS, const DeltaInfo &DI); - - DeltaInfo compressPath(Candidate &C, Candidate *Basis) const; - - Candidate *pickRewriteCandidate(Instruction *I) const; - void sortCandidateInstructions(); - static Constant *getIndexDelta(Candidate &C, Candidate &Basis); - static bool isSimilar(Candidate &C, Candidate &Basis, Candidate::DKind K); - - // Add Basis -> C in DependencyGraph and propagate - // C.Stride and C.Delta's dependency to C - void addDependency(Candidate &C, Candidate *Basis) { - if (Basis) - DependencyGraph[Basis->Ins].emplace_back(C.Ins); - - // If any candidate of Inst has a basis, then Inst will be rewritten, - // C must be rewritten after rewriting Inst, so we need to propagate - // the dependency to C - auto PropagateDependency = [&](Instruction *Inst) { - if (auto CandsIt = RewriteCandidates.find(Inst); - CandsIt != RewriteCandidates.end() && - llvm::any_of(CandsIt->second, - [](Candidate *Cand) { return Cand->Basis; })) - DependencyGraph[Inst].emplace_back(C.Ins); - }; - - // If C has a variable delta and the delta is a candidate, - // propagate its dependency to C - if (auto *DeltaInst = dyn_cast_or_null(C.Delta)) - PropagateDependency(DeltaInst); - - // If the stride is a candidate, propagate its dependency to C - if (auto *StrideInst = dyn_cast(C.Stride)) - PropagateDependency(StrideInst); - }; + // Temporarily holds all instructions that are unlinked (but not deleted) by + // rewriteCandidateWithBasis. These instructions will be actually removed + // after all rewriting finishes. 
+ std::vector UnlinkedInstructions; }; -inline raw_ostream &operator<<(raw_ostream &OS, - const StraightLineStrengthReduce::Candidate &C) { - OS << "Ins: " << *C.Ins << "\n Base: " << *C.Base - << "\n Index: " << *C.Index << "\n Stride: " << *C.Stride - << "\n StrideSCEV: " << *C.StrideSCEV; - if (C.Basis) - OS << "\n Delta: " << *C.Delta << "\n Basis: \n [ " << *C.Basis << " ]"; - return OS; -} - -[[maybe_unused]] LLVM_DUMP_METHOD inline raw_ostream & -operator<<(raw_ostream &OS, const StraightLineStrengthReduce::DeltaInfo &DI) { - OS << "Cand: " << *DI.Cand << "\n"; - OS << "Delta Kind: "; - switch (DI.DeltaKind) { - case StraightLineStrengthReduce::Candidate::IndexDelta: - OS << "Index"; - break; - case StraightLineStrengthReduce::Candidate::BaseDelta: - OS << "Base"; - break; - case StraightLineStrengthReduce::Candidate::StrideDelta: - OS << "Stride"; - break; - default: - break; - } - OS << "\nDelta: " << *DI.Delta; - return OS; -} - } // end anonymous namespace char StraightLineStrengthReduceLegacyPass::ID = 0; @@ -616,290 +269,17 @@ FunctionPass *llvm::createStraightLineStrengthReducePass() { return new StraightLineStrengthReduceLegacyPass(); } -// A helper function that unifies the bitwidth of A and B. -static void unifyBitWidth(APInt &A, APInt &B) { - if (A.getBitWidth() < B.getBitWidth()) - A = A.sext(B.getBitWidth()); - else if (A.getBitWidth() > B.getBitWidth()) - B = B.sext(A.getBitWidth()); -} - -Constant *StraightLineStrengthReduce::getIndexDelta(Candidate &C, - Candidate &Basis) { - APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue(); - unifyBitWidth(Idx, BasisIdx); - APInt IndexDelta = Idx - BasisIdx; - IntegerType *DeltaType = - IntegerType::get(C.Ins->getContext(), IndexDelta.getBitWidth()); - return ConstantInt::get(DeltaType, IndexDelta); -} - -bool StraightLineStrengthReduce::isSimilar(Candidate &C, Candidate &Basis, - Candidate::DKind K) { - bool SameType = false; - switch (K) { - case Candidate::StrideDelta: - SameType = C.StrideSCEV->getType() == Basis.StrideSCEV->getType(); - break; - case Candidate::BaseDelta: - SameType = C.Base->getType() == Basis.Base->getType(); - break; - case Candidate::IndexDelta: - SameType = true; - break; - default:; - } - return SameType && Basis.Ins != C.Ins && - Basis.CandidateKind == C.CandidateKind; -} - -void StraightLineStrengthReduce::setBasisAndDeltaFor(Candidate &C) { - auto SearchFrom = [this, &C](const CandidateDictTy::BBToCandsTy &BBToCands, - auto IsTarget) -> bool { - // Search dominating candidates by walking the immediate-dominator chain - // from the candidate's defining block upward. Visiting blocks in this - // order ensures we prefer the closest dominating basis. - const BasicBlock *BB = C.Ins->getParent(); - while (BB) { - auto It = BBToCands.find(BB); - if (It != BBToCands.end()) - for (Candidate *Basis : reverse(It->second)) - if (IsTarget(Basis)) - return true; - - const DomTreeNode *Node = DT->getNode(BB); - if (!Node) - break; - Node = Node->getIDom(); - BB = Node ? Node->getBlock() : nullptr; - } - return false; - }; - - // Priority: - // Constant Delta from Index > Constant Delta from Base > - // Constant Delta from Stride > Variable Delta from Base or Stride - // TODO: Change the priority to align with the cost model. 
- - // First, look for a constant index-diff basis - if (const auto *IndexDeltaCandidates = - CandidateDict.getCandidatesWithDeltaKind(C, Candidate::IndexDelta)) { - bool FoundConstDelta = - SearchFrom(*IndexDeltaCandidates, [&](Candidate *Basis) { - if (isSimilar(C, *Basis, Candidate::IndexDelta)) { - assert(DT->dominates(Basis->Ins, C.Ins)); - auto *Delta = getIndexDelta(C, *Basis); - if (!C.isProfitableRewrite(Delta, Candidate::IndexDelta)) - return false; - C.Basis = Basis; - C.DeltaKind = Candidate::IndexDelta; - C.Delta = Delta; - LLVM_DEBUG(dbgs() << "Found delta from Index " << *C.Delta << "\n"); - return true; - } - return false; - }); - if (FoundConstDelta) - return; - } - - // No constant-index-diff basis found. look for the best possible base-diff - // or stride-diff basis - // Base/Stride diffs not supported for form (B + i) * S - if (C.CandidateKind == Candidate::Mul) - return; - - auto For = [this, &C](Candidate::DKind K) { - // return true if find a Basis with constant delta and stop searching, - // return false if did not find a Basis or the delta is not a constant - // and continue searching for a Basis with constant delta - return [K, this, &C](Candidate *Basis) -> bool { - if (!isSimilar(C, *Basis, K)) - return false; - - assert(DT->dominates(Basis->Ins, C.Ins)); - const SCEV *BasisPart = - (K == Candidate::BaseDelta) ? Basis->Base : Basis->StrideSCEV; - const SCEV *CandPart = - (K == Candidate::BaseDelta) ? C.Base : C.StrideSCEV; - const SCEV *Diff = SE->getMinusSCEV(CandPart, BasisPart); - Value *AvailableVal = getNearestValueOfSCEV(Diff, C.Ins); - if (!AvailableVal) - return false; - - // Record delta if none has been found yet, or the new delta is - // a constant that is better than the existing delta. - if (!C.Delta || isa(AvailableVal)) { - C.Delta = AvailableVal; - C.Basis = Basis; - C.DeltaKind = K; - } - return isa(C.Delta); - }; - }; - - if (const auto *BaseDeltaCandidates = - CandidateDict.getCandidatesWithDeltaKind(C, Candidate::BaseDelta)) { - if (SearchFrom(*BaseDeltaCandidates, For(Candidate::BaseDelta))) { - LLVM_DEBUG(dbgs() << "Found delta from Base: " << *C.Delta << "\n"); - return; - } - } - - if (const auto *StrideDeltaCandidates = - CandidateDict.getCandidatesWithDeltaKind(C, Candidate::StrideDelta)) { - if (SearchFrom(*StrideDeltaCandidates, For(Candidate::StrideDelta))) { - LLVM_DEBUG(dbgs() << "Found delta from Stride: " << *C.Delta << "\n"); - return; - } - } - - // If we did not find a constant delta, we might have found a variable delta - if (C.Delta) { - LLVM_DEBUG({ - dbgs() << "Found delta from "; - if (C.DeltaKind == Candidate::BaseDelta) - dbgs() << "Base: "; - else - dbgs() << "Stride: "; - dbgs() << *C.Delta << "\n"; - }); - assert(C.DeltaKind != Candidate::InvalidDelta && C.Basis); - } -} - -// Compress the path from `Basis` to the deepest Basis in the Basis chain -// to avoid non-profitable data dependency and improve ILP. 
-// X = A + 1 -// Y = X + 1 -// Z = Y + 1 -// -> -// X = A + 1 -// Y = A + 2 -// Z = A + 3 -// Return the delta info for C aginst the new Basis -auto StraightLineStrengthReduce::compressPath(Candidate &C, - Candidate *Basis) const - -> DeltaInfo { - if (!Basis || !Basis->Basis || C.CandidateKind == Candidate::Mul) - return {}; - Candidate *Root = Basis; - Value *NewDelta = nullptr; - auto NewKind = Candidate::InvalidDelta; - - while (Root->Basis) { - Candidate *NextRoot = Root->Basis; - if (C.Base == NextRoot->Base && C.StrideSCEV == NextRoot->StrideSCEV && - isSimilar(C, *NextRoot, Candidate::IndexDelta)) { - ConstantInt *CI = cast(getIndexDelta(C, *NextRoot)); - if (CI->isZero() || CI->isOne() || isa(C.StrideSCEV)) { - Root = NextRoot; - NewKind = Candidate::IndexDelta; - NewDelta = CI; - continue; - } - } - - const SCEV *CandPart = nullptr; - const SCEV *BasisPart = nullptr; - auto CurrKind = Candidate::InvalidDelta; - if (C.Base == NextRoot->Base && C.Index == NextRoot->Index) { - CandPart = C.StrideSCEV; - BasisPart = NextRoot->StrideSCEV; - CurrKind = Candidate::StrideDelta; - } else if (C.StrideSCEV == NextRoot->StrideSCEV && - C.Index == NextRoot->Index) { - CandPart = C.Base; - BasisPart = NextRoot->Base; - CurrKind = Candidate::BaseDelta; - } else - break; - - assert(CandPart && BasisPart); - if (!isSimilar(C, *NextRoot, CurrKind)) - break; - - if (auto DeltaVal = - dyn_cast(SE->getMinusSCEV(CandPart, BasisPart))) { - Root = NextRoot; - NewDelta = DeltaVal->getValue(); - NewKind = CurrKind; - } else - break; - } - - if (Root != Basis) { - assert(NewKind != Candidate::InvalidDelta && NewDelta); - LLVM_DEBUG(dbgs() << "Found new Basis with " << *NewDelta - << " from path compression.\n"); - return {Root, NewKind, NewDelta}; - } - - return {}; -} - -// Topologically sort candidate instructions based on their relationship in -// dependency graph. -void StraightLineStrengthReduce::sortCandidateInstructions() { - SortedCandidateInsts.clear(); - // An instruction may have multiple candidates that get different Basis - // instructions, and each candidate can get dependencies from Basis and - // Stride when Stride will also be rewritten by SLSR. Hence, an instruction - // may have multiple dependencies. Use InDegree to ensure all dependencies - // processed before processing itself. - DenseMap InDegree; - for (auto &KV : DependencyGraph) { - InDegree.try_emplace(KV.first, 0); - - for (auto *Child : KV.second) { - InDegree[Child]++; - } - } - std::queue WorkList; - DenseSet Visited; - - for (auto &KV : DependencyGraph) - if (InDegree[KV.first] == 0) - WorkList.push(KV.first); - - while (!WorkList.empty()) { - Instruction *I = WorkList.front(); - WorkList.pop(); - if (!Visited.insert(I).second) - continue; - - SortedCandidateInsts.push_back(I); - - for (auto *Next : DependencyGraph[I]) { - auto &Degree = InDegree[Next]; - if (--Degree == 0) - WorkList.push(Next); - } - } - - assert(SortedCandidateInsts.size() == DependencyGraph.size() && - "Dependency graph should not have cycles"); -} - -auto StraightLineStrengthReduce::pickRewriteCandidate(Instruction *I) const - -> Candidate * { - // Return the candidate of instruction I that has the highest profit. 
- auto It = RewriteCandidates.find(I); - if (It == RewriteCandidates.end()) - return nullptr; - - Candidate *BestC = nullptr; - auto BestEfficiency = Candidate::Unknown; - for (Candidate *C : reverse(It->second)) - if (C->Basis) { - auto Efficiency = C->getRewriteEfficiency(); - if (Efficiency > BestEfficiency) { - BestEfficiency = Efficiency; - BestC = C; - } - } - - return BestC; +bool StraightLineStrengthReduce::isBasisFor(const Candidate &Basis, + const Candidate &C) { + return (Basis.Ins != C.Ins && // skip the same instruction + // They must have the same type too. Basis.Base == C.Base + // doesn't guarantee their types are the same (PR23975). + Basis.Ins->getType() == C.Ins->getType() && + // Basis must dominate C in order to rewrite C with respect to Basis. + DT->dominates(Basis.Ins->getParent(), C.Ins->getParent()) && + // They share the same base, stride, and candidate kind. + Basis.Base == C.Base && Basis.Stride == C.Stride && + Basis.CandidateKind == C.CandidateKind); } static bool isGEPFoldable(GetElementPtrInst *GEP, @@ -919,7 +299,8 @@ static bool isAddFoldable(const SCEV *Base, ConstantInt *Index, Value *Stride, } bool StraightLineStrengthReduce::isFoldable(const Candidate &C, - TargetTransformInfo *TTI) { + TargetTransformInfo *TTI, + const DataLayout *DL) { if (C.CandidateKind == Candidate::Add) return isAddFoldable(C.Base, C.Index, C.Stride, TTI); if (C.CandidateKind == Candidate::GEP) @@ -927,39 +308,75 @@ bool StraightLineStrengthReduce::isFoldable(const Candidate &C, return false; } +// Returns true if GEP has zero or one non-zero index. +static bool hasOnlyOneNonZeroIndex(GetElementPtrInst *GEP) { + unsigned NumNonZeroIndices = 0; + for (Use &Idx : GEP->indices()) { + ConstantInt *ConstIdx = dyn_cast(Idx); + if (ConstIdx == nullptr || !ConstIdx->isZero()) + ++NumNonZeroIndices; + } + return NumNonZeroIndices <= 1; +} + +bool StraightLineStrengthReduce::isSimplestForm(const Candidate &C) { + if (C.CandidateKind == Candidate::Add) { + // B + 1 * S or B + (-1) * S + return C.Index->isOne() || C.Index->isMinusOne(); + } + if (C.CandidateKind == Candidate::Mul) { + // (B + 0) * S + return C.Index->isZero(); + } + if (C.CandidateKind == Candidate::GEP) { + // (char*)B + S or (char*)B - S + return ((C.Index->isOne() || C.Index->isMinusOne()) && + hasOnlyOneNonZeroIndex(cast(C.Ins))); + } + return false; +} + +// TODO: We currently implement an algorithm whose time complexity is linear in +// the number of existing candidates. However, we could do better by using +// ScopedHashTable. Specifically, while traversing the dominator tree, we could +// maintain all the candidates that dominate the basic block being traversed in +// a ScopedHashTable. This hash table is indexed by the base and the stride of +// a candidate. Therefore, finding the immediate basis of a candidate boils down +// to one hash-table look up. void StraightLineStrengthReduce::allocateCandidatesAndFindBasis( Candidate::Kind CT, const SCEV *B, ConstantInt *Idx, Value *S, Instruction *I) { - // Record the SCEV of S that we may use it as a variable delta. - // Ensure that we rewrite C with a existing IR that reproduces delta value. - - Candidate C(CT, B, Idx, S, I, getAndRecordSCEV(S)); - // If we can fold I into an addressing mode, computing I is likely free or - // takes only one instruction. So, we don't need to analyze or rewrite it. 
+ Candidate C(CT, B, Idx, S, I); + // SLSR can complicate an instruction in two cases: // - // Currently, this algorithm can at best optimize complex computations into - // a `variable +/* constant` form. However, some targets have stricter - // constraints on the their addressing mode. - // For example, a `variable + constant` can only be folded to an addressing - // mode if the constant falls within a certain range. - // So, we also check if the instruction is already high efficient enough - // for the strength reduction algorithm. - if (!isFoldable(C, TTI) && !C.isHighEfficiency()) { - setBasisAndDeltaFor(C); - - // Compress unnecessary rewrite to improve ILP - if (auto Res = compressPath(C, C.Basis)) { - C.Basis = Res.Cand; - C.DeltaKind = Res.DeltaKind; - C.Delta = Res.Delta; + // 1. If we can fold I into an addressing mode, computing I is likely free or + // takes only one instruction. + // + // 2. I is already in a simplest form. For example, when + // X = B + 8 * S + // Y = B + S, + // rewriting Y to X - 7 * S is probably a bad idea. + // + // In the above cases, we still add I to the candidate list so that I can be + // the basis of other candidates, but we leave I's basis blank so that I + // won't be rewritten. + if (!isFoldable(C, TTI, DL) && !isSimplestForm(C)) { + // Try to compute the immediate basis of C. + unsigned NumIterations = 0; + // Limit the scan radius to avoid running in quadratice time. + static const unsigned MaxNumIterations = 50; + for (auto Basis = Candidates.rbegin(); + Basis != Candidates.rend() && NumIterations < MaxNumIterations; + ++Basis, ++NumIterations) { + if (isBasisFor(*Basis, C)) { + C.Basis = &(*Basis); + break; + } } } // Regardless of whether we find a basis for C, we need to push C to the // candidate list so that it can be the basis of other candidates. - LLVM_DEBUG(dbgs() << "Allocated Candidate: " << C << "\n"); Candidates.push_back(C); - RewriteCandidates[C.Ins].push_back(&Candidates.back()); - CandidateDict.add(Candidates.back()); } void StraightLineStrengthReduce::allocateCandidatesAndFindBasis( @@ -1058,6 +475,54 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForMul( } } +void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( + const SCEV *B, ConstantInt *Idx, Value *S, uint64_t ElementSize, + Instruction *I) { + // I = B + sext(Idx *nsw S) * ElementSize + // = B + (sext(Idx) * sext(S)) * ElementSize + // = B + (sext(Idx) * ElementSize) * sext(S) + // Casting to IntegerType is safe because we skipped vector GEPs. + IntegerType *PtrIdxTy = cast(DL->getIndexType(I->getType())); + ConstantInt *ScaledIdx = ConstantInt::get( + PtrIdxTy, Idx->getSExtValue() * (int64_t)ElementSize, true); + allocateCandidatesAndFindBasis(Candidate::GEP, B, ScaledIdx, S, I); +} + +void StraightLineStrengthReduce::factorArrayIndex(Value *ArrayIdx, + const SCEV *Base, + uint64_t ElementSize, + GetElementPtrInst *GEP) { + // At least, ArrayIdx = ArrayIdx *nsw 1. + allocateCandidatesAndFindBasisForGEP( + Base, ConstantInt::get(cast(ArrayIdx->getType()), 1), + ArrayIdx, ElementSize, GEP); + Value *LHS = nullptr; + ConstantInt *RHS = nullptr; + // One alternative is matching the SCEV of ArrayIdx instead of ArrayIdx + // itself. This would allow us to handle the shl case for free. However, + // matching SCEVs has two issues: + // + // 1. this would complicate rewriting because the rewriting procedure + // would have to translate SCEVs back to IR instructions. 
This translation + // is difficult when LHS is further evaluated to a composite SCEV. + // + // 2. ScalarEvolution is designed to be control-flow oblivious. It tends + // to strip nsw/nuw flags which are critical for SLSR to trace into + // sext'ed multiplication. + if (match(ArrayIdx, m_NSWMul(m_Value(LHS), m_ConstantInt(RHS)))) { + // SLSR is currently unsafe if i * S may overflow. + // GEP = Base + sext(LHS *nsw RHS) * ElementSize + allocateCandidatesAndFindBasisForGEP(Base, RHS, LHS, ElementSize, GEP); + } else if (match(ArrayIdx, m_NSWShl(m_Value(LHS), m_ConstantInt(RHS)))) { + // GEP = Base + sext(LHS <getBitWidth(), 1); + ConstantInt *PowerOf2 = + ConstantInt::get(RHS->getContext(), One << RHS->getValue()); + allocateCandidatesAndFindBasisForGEP(Base, PowerOf2, LHS, ElementSize, GEP); + } +} + void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( GetElementPtrInst *GEP) { // TODO: handle vector GEPs @@ -1081,14 +546,11 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( const SCEV *BaseExpr = SE->getGEPExpr(cast(GEP), IndexExprs); Value *ArrayIdx = GEP->getOperand(I); uint64_t ElementSize = GTI.getSequentialElementStride(*DL); - IntegerType *PtrIdxTy = cast(DL->getIndexType(GEP->getType())); - ConstantInt *ElementSizeIdx = ConstantInt::get(PtrIdxTy, ElementSize, true); if (ArrayIdx->getType()->getIntegerBitWidth() <= DL->getIndexSizeInBits(GEP->getAddressSpace())) { // Skip factoring if ArrayIdx is wider than the index size, because // ArrayIdx is implicitly truncated to the index size. - allocateCandidatesAndFindBasis(Candidate::GEP, BaseExpr, ElementSizeIdx, - ArrayIdx, GEP); + factorArrayIndex(ArrayIdx, BaseExpr, ElementSize, GEP); } // When ArrayIdx is the sext of a value, we try to factor that value as // well. Handling this case is important because array indices are @@ -1099,159 +561,118 @@ void StraightLineStrengthReduce::allocateCandidatesAndFindBasisForGEP( DL->getIndexSizeInBits(GEP->getAddressSpace())) { // Skip factoring if TruncatedArrayIdx is wider than the pointer size, // because TruncatedArrayIdx is implicitly truncated to the pointer size. - allocateCandidatesAndFindBasis(Candidate::GEP, BaseExpr, ElementSizeIdx, - TruncatedArrayIdx, GEP); + factorArrayIndex(TruncatedArrayIdx, BaseExpr, ElementSize, GEP); } IndexExprs[I - 1] = OrigIndexExpr; } } +// A helper function that unifies the bitwidth of A and B. 
+static void unifyBitWidth(APInt &A, APInt &B) { + if (A.getBitWidth() < B.getBitWidth()) + A = A.sext(B.getBitWidth()); + else if (A.getBitWidth() > B.getBitWidth()) + B = B.sext(A.getBitWidth()); +} + Value *StraightLineStrengthReduce::emitBump(const Candidate &Basis, const Candidate &C, IRBuilder<> &Builder, const DataLayout *DL) { - auto CreateMul = [&](Value *LHS, Value *RHS) { - if (ConstantInt *CR = dyn_cast(RHS)) { - const APInt &ConstRHS = CR->getValue(); - IntegerType *DeltaType = - IntegerType::get(C.Ins->getContext(), ConstRHS.getBitWidth()); - if (ConstRHS.isPowerOf2()) { - ConstantInt *Exponent = - ConstantInt::get(DeltaType, ConstRHS.logBase2()); - return Builder.CreateShl(LHS, Exponent); - } - if (ConstRHS.isNegatedPowerOf2()) { - ConstantInt *Exponent = - ConstantInt::get(DeltaType, (-ConstRHS).logBase2()); - return Builder.CreateNeg(Builder.CreateShl(LHS, Exponent)); - } - } - - return Builder.CreateMul(LHS, RHS); - }; - - Value *Delta = C.Delta; - // If Delta is 0, C is a fully redundant of C.Basis, - // just replace C.Ins with Basis.Ins - if (ConstantInt *CI = dyn_cast(Delta); - CI && CI->getValue().isZero()) - return nullptr; - - if (C.DeltaKind == Candidate::IndexDelta) { - APInt IndexDelta = cast(C.Delta)->getValue(); - // IndexDelta - // X = B + i * S - // Y = B + i` * S - // = B + (i + IndexDelta) * S - // = B + i * S + IndexDelta * S - // = X + IndexDelta * S - // Bump = (i' - i) * S - - // Common case 1: if (i' - i) is 1, Bump = S. - if (IndexDelta == 1) - return C.Stride; - // Common case 2: if (i' - i) is -1, Bump = -S. - if (IndexDelta.isAllOnes()) - return Builder.CreateNeg(C.Stride); - - IntegerType *DeltaType = - IntegerType::get(Basis.Ins->getContext(), IndexDelta.getBitWidth()); - Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType); - - return CreateMul(ExtendedStride, C.Delta); + APInt Idx = C.Index->getValue(), BasisIdx = Basis.Index->getValue(); + unifyBitWidth(Idx, BasisIdx); + APInt IndexOffset = Idx - BasisIdx; + + // Compute Bump = C - Basis = (i' - i) * S. + // Common case 1: if (i' - i) is 1, Bump = S. + if (IndexOffset == 1) + return C.Stride; + // Common case 2: if (i' - i) is -1, Bump = -S. + if (IndexOffset.isAllOnes()) + return Builder.CreateNeg(C.Stride); + + // Otherwise, Bump = (i' - i) * sext/trunc(S). Note that (i' - i) and S may + // have different bit widths. + IntegerType *DeltaType = + IntegerType::get(Basis.Ins->getContext(), IndexOffset.getBitWidth()); + Value *ExtendedStride = Builder.CreateSExtOrTrunc(C.Stride, DeltaType); + if (IndexOffset.isPowerOf2()) { + // If (i' - i) is a power of 2, Bump = sext/trunc(S) << log(i' - i). + ConstantInt *Exponent = ConstantInt::get(DeltaType, IndexOffset.logBase2()); + return Builder.CreateShl(ExtendedStride, Exponent); } - - assert(C.DeltaKind == Candidate::StrideDelta || - C.DeltaKind == Candidate::BaseDelta); - assert(C.CandidateKind != Candidate::Mul); - // StrideDelta - // X = B + i * S - // Y = B + i * S' - // = B + i * (S + StrideDelta) - // = B + i * S + i * StrideDelta - // = X + i * StrideDelta - // Bump = i * (S' - S) - // - // BaseDelta - // X = B + i * S - // Y = B' + i * S - // = (B + BaseDelta) + i * S - // = X + BaseDelta - // Bump = (B' - B). - Value *Bump = C.Delta; - if (C.DeltaKind == Candidate::StrideDelta) { - // If this value is consumed by a GEP, promote StrideDelta before doing - // StrideDelta * Index to ensure the same semantics as the original GEP. 
- if (C.CandidateKind == Candidate::GEP) { - auto *GEP = cast(C.Ins); - Type *NewScalarIndexTy = - DL->getIndexType(GEP->getPointerOperandType()->getScalarType()); - Bump = Builder.CreateSExtOrTrunc(Bump, NewScalarIndexTy); - } - if (!C.Index->isOne()) { - Value *ExtendedIndex = - Builder.CreateSExtOrTrunc(C.Index, Bump->getType()); - Bump = CreateMul(Bump, ExtendedIndex); - } + if (IndexOffset.isNegatedPowerOf2()) { + // If (i - i') is a power of 2, Bump = -sext/trunc(S) << log(i' - i). + ConstantInt *Exponent = + ConstantInt::get(DeltaType, (-IndexOffset).logBase2()); + return Builder.CreateNeg(Builder.CreateShl(ExtendedStride, Exponent)); } - return Bump; + Constant *Delta = ConstantInt::get(DeltaType, IndexOffset); + return Builder.CreateMul(ExtendedStride, Delta); } -void StraightLineStrengthReduce::rewriteCandidate(const Candidate &C) { +void StraightLineStrengthReduce::rewriteCandidateWithBasis( + const Candidate &C, const Candidate &Basis) { if (!DebugCounter::shouldExecute(StraightLineStrengthReduceCounter)) return; - const Candidate &Basis = *C.Basis; - assert(C.Delta && C.CandidateKind == Basis.CandidateKind && - C.hasValidDelta(Basis)); + assert(C.CandidateKind == Basis.CandidateKind && C.Base == Basis.Base && + C.Stride == Basis.Stride); + // We run rewriteCandidateWithBasis on all candidates in a post-order, so the + // basis of a candidate cannot be unlinked before the candidate. + assert(Basis.Ins->getParent() != nullptr && "the basis is unlinked"); + + // An instruction can correspond to multiple candidates. Therefore, instead of + // simply deleting an instruction when we rewrite it, we mark its parent as + // nullptr (i.e. unlink it) so that we can skip the candidates whose + // instruction is already rewritten. + if (!C.Ins->getParent()) + return; IRBuilder<> Builder(C.Ins); Value *Bump = emitBump(Basis, C, Builder, DL); Value *Reduced = nullptr; // equivalent to but weaker than C.Ins - // If delta is 0, C is a fully redundant of Basis, and Bump is nullptr, - // just replace C.Ins with Basis.Ins - if (!Bump) - Reduced = Basis.Ins; - else { - switch (C.CandidateKind) { - case Candidate::Add: - case Candidate::Mul: { - // C = Basis + Bump - Value *NegBump; - if (match(Bump, m_Neg(m_Value(NegBump)))) { - // If Bump is a neg instruction, emit C = Basis - (-Bump). - Reduced = Builder.CreateSub(Basis.Ins, NegBump); - // We only use the negative argument of Bump, and Bump itself may be - // trivially dead. - RecursivelyDeleteTriviallyDeadInstructions(Bump); - } else { - // It's tempting to preserve nsw on Bump and/or Reduced. However, it's - // usually unsound, e.g., - // - // X = (-2 +nsw 1) *nsw INT_MAX - // Y = (-2 +nsw 3) *nsw INT_MAX - // => - // Y = X + 2 * INT_MAX - // - // Neither + and * in the resultant expression are nsw. - Reduced = Builder.CreateAdd(Basis.Ins, Bump); - } - break; - } - case Candidate::GEP: { - bool InBounds = cast(C.Ins)->isInBounds(); - // C = (char *)Basis + Bump - Reduced = Builder.CreatePtrAdd(Basis.Ins, Bump, "", InBounds); - break; + switch (C.CandidateKind) { + case Candidate::Add: + case Candidate::Mul: { + // C = Basis + Bump + Value *NegBump; + if (match(Bump, m_Neg(m_Value(NegBump)))) { + // If Bump is a neg instruction, emit C = Basis - (-Bump). + Reduced = Builder.CreateSub(Basis.Ins, NegBump); + // We only use the negative argument of Bump, and Bump itself may be + // trivially dead. + RecursivelyDeleteTriviallyDeadInstructions(Bump); + } else { + // It's tempting to preserve nsw on Bump and/or Reduced. 
However, it's + // usually unsound, e.g., + // + // X = (-2 +nsw 1) *nsw INT_MAX + // Y = (-2 +nsw 3) *nsw INT_MAX + // => + // Y = X + 2 * INT_MAX + // + // Neither + and * in the resultant expression are nsw. + Reduced = Builder.CreateAdd(Basis.Ins, Bump); } - default: - llvm_unreachable("C.CandidateKind is invalid"); - }; - Reduced->takeName(C.Ins); + break; + } + case Candidate::GEP: { + bool InBounds = cast(C.Ins)->isInBounds(); + // C = (char *)Basis + Bump + Reduced = Builder.CreatePtrAdd(Basis.Ins, Bump, "", InBounds); + break; } + default: + llvm_unreachable("C.CandidateKind is invalid"); + }; + Reduced->takeName(C.Ins); C.Ins->replaceAllUsesWith(Reduced); - DeadInstructions.push_back(C.Ins); + // Unlink C.Ins so that we can skip other candidates also corresponding to + // C.Ins. The actual deletion is postponed to the end of runOnFunction. + C.Ins->removeFromParent(); + UnlinkedInstructions.push_back(C.Ins); } bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) { @@ -1265,42 +686,33 @@ bool StraightLineStrengthReduceLegacyPass::runOnFunction(Function &F) { } bool StraightLineStrengthReduce::runOnFunction(Function &F) { - LLVM_DEBUG(dbgs() << "SLSR on Function: " << F.getName() << "\n"); // Traverse the dominator tree in the depth-first order. This order makes sure // all bases of a candidate are in Candidates when we process it. for (const auto Node : depth_first(DT)) for (auto &I : *(Node->getBlock())) allocateCandidatesAndFindBasis(&I); - // Build the dependency graph and sort candidate instructions from dependency - // roots to leaves - for (auto &C : Candidates) { - DependencyGraph.try_emplace(C.Ins); - addDependency(C, C.Basis); + // Rewrite candidates in the reverse depth-first order. This order makes sure + // a candidate being rewritten is not a basis for any other candidate. + while (!Candidates.empty()) { + const Candidate &C = Candidates.back(); + if (C.Basis != nullptr) { + rewriteCandidateWithBasis(C, *C.Basis); + } + Candidates.pop_back(); + } + + // Delete all unlink instructions. 
+ for (auto *UnlinkedInst : UnlinkedInstructions) { + for (unsigned I = 0, E = UnlinkedInst->getNumOperands(); I != E; ++I) { + Value *Op = UnlinkedInst->getOperand(I); + UnlinkedInst->setOperand(I, nullptr); + RecursivelyDeleteTriviallyDeadInstructions(Op); + } + UnlinkedInst->deleteValue(); } - sortCandidateInstructions(); - - // Rewrite candidates in the topological order that rewrites a Candidate - // always before rewriting its Basis - for (Instruction *I : reverse(SortedCandidateInsts)) - if (Candidate *C = pickRewriteCandidate(I)) - rewriteCandidate(*C); - - for (auto *DeadIns : DeadInstructions) - // A dead instruction may be another dead instruction's op, - // don't delete an instruction twice - if (DeadIns->getParent()) - RecursivelyDeleteTriviallyDeadInstructions(DeadIns); - - bool Ret = !DeadInstructions.empty(); - DeadInstructions.clear(); - DependencyGraph.clear(); - RewriteCandidates.clear(); - SortedCandidateInsts.clear(); - // First clear all references to candidates in the list - CandidateDict.clear(); - // Then destroy the list - Candidates.clear(); + bool Ret = !UnlinkedInstructions.empty(); + UnlinkedInstructions.clear(); return Ret; } diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index ef7a13819a799..ebbeab94066d6 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -541,9 +541,10 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_lshr_b32 s2, s0, 16 ; GFX908-NEXT: v_cvt_f32_f16_e32 v19, s2 ; GFX908-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 -; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 ; GFX908-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX908-NEXT: s_or_b32 s14, s14, 28 ; GFX908-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 ; GFX908-NEXT: v_mov_b32_e32 v1, 0 ; GFX908-NEXT: s_waitcnt vmcnt(0) @@ -609,13 +610,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: s_add_u32 s22, s20, s9 ; GFX908-NEXT: s_addc_u32 s23, s21, s13 -; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:16 glc +; GFX908-NEXT: global_load_dword v21, v17, s[22:23] offset:-12 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:20 glc +; GFX908-NEXT: global_load_dword v20, v17, s[22:23] offset:-8 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:24 glc +; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:-4 glc ; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: global_load_dword v12, v17, s[22:23] offset:28 glc +; GFX908-NEXT: global_load_dword v12, v17, s[22:23] glc ; GFX908-NEXT: s_waitcnt vmcnt(0) ; GFX908-NEXT: ds_read_b64 v[12:13], v17 ; GFX908-NEXT: ds_read_b64 v[14:15], v0 @@ -709,6 +710,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_lshl_b64 s[6:7], s[4:5], 5 ; GFX90A-NEXT: s_lshl_b64 s[14:15], s[10:11], 5 ; GFX90A-NEXT: s_and_b64 s[0:1], exec, s[0:1] +; GFX90A-NEXT: s_or_b32 s14, s14, 28 ; GFX90A-NEXT: s_lshl_b64 s[16:17], s[8:9], 5 ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: v_readfirstlane_b32 s2, v18 @@ -769,13 +771,13 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; 
GFX90A-NEXT: s_add_u32 s22, s20, s9 ; GFX90A-NEXT: s_addc_u32 s23, s21, s13 -; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:16 glc +; GFX90A-NEXT: global_load_dword v21, v19, s[22:23] offset:-12 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:20 glc +; GFX90A-NEXT: global_load_dword v20, v19, s[22:23] offset:-8 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:24 glc +; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:-4 glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] offset:28 glc +; GFX90A-NEXT: global_load_dword v14, v19, s[22:23] glc ; GFX90A-NEXT: s_waitcnt vmcnt(0) ; GFX90A-NEXT: ds_read_b64 v[14:15], v19 ; GFX90A-NEXT: ds_read_b64 v[16:17], v0 diff --git a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll index 1b447571efaf2..af1c64321222b 100644 --- a/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn -amdgpu-scalar-ir-passes=false < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn < %s | FileCheck %s ; Test for a bug where DAGCombiner::ReassociateOps() was creating adds ; with offset in the first operand and base pointers in the second. diff --git a/llvm/test/CodeGen/AMDGPU/idot2.ll b/llvm/test/CodeGen/AMDGPU/idot2.ll index bf65657ff841c..22907ca28c47f 100644 --- a/llvm/test/CodeGen/AMDGPU/idot2.ll +++ b/llvm/test/CodeGen/AMDGPU/idot2.ll @@ -2396,7 +2396,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v4, v3 +; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2417,7 +2417,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mad_u32_u24 v1, v2, v1, s0 -; GFX9-DL-NEXT: v_add3_u32 v1, v1, v4, v3 +; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -2442,7 +2442,7 @@ define amdgpu_kernel void @udot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_u32_u24 v0, v3, v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v2, v1 +; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, @@ -2553,7 +2553,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NODL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 -; GFX9-NODL-NEXT: v_add3_u32 v1, v1, v4, v3 +; GFX9-NODL-NEXT: v_add3_u32 v1, v4, v1, v3 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NODL-NEXT: s_endpgm ; @@ -2574,7 +2574,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_i32_i24_e32 v4, v2, v1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mad_i32_i24 v1, v2, v1, s0 -; GFX9-DL-NEXT: v_add3_u32 v1, v1, v4, v3 +; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v3 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; 
GFX9-DL-NEXT: s_endpgm ; @@ -2599,7 +2599,7 @@ define amdgpu_kernel void @idot2_MultipleUses_mul2(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: v_mad_i32_i24 v0, v3, v0, s0 ; GFX10-DL-NEXT: v_mov_b32_e32 v3, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v2, v1 +; GFX10-DL-NEXT: v_add3_u32 v0, v2, v0, v1 ; GFX10-DL-NEXT: global_store_dword v3, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll index 1a22fa805a5a7..fd1f7b000472a 100644 --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -3268,19 +3268,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[4:7], 0 addr64 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_waitcnt vmcnt(1) -; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 ; GFX7-NEXT: v_bfe_i32 v1, v2, 0, 8 +; GFX7-NEXT: v_bfe_i32 v3, v2, 8, 8 +; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 +; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 ; GFX7-NEXT: v_bfe_u32 v6, v0, 8, 8 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff, v3 -; GFX7-NEXT: v_bfe_i32 v4, v2, 16, 8 -; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX7-NEXT: v_and_b32_e32 v5, 0xff, v0 -; GFX7-NEXT: v_mul_u32_u24_e32 v3, v6, v3 +; GFX7-NEXT: v_mul_u32_u24_e32 v1, v1, v5 ; GFX7-NEXT: v_ashrrev_i32_e32 v2, 24, v2 ; GFX7-NEXT: v_bfe_u32 v7, v0, 16, 8 ; GFX7-NEXT: v_and_b32_e32 v4, 0xffff, v4 -; GFX7-NEXT: v_mad_u32_u24 v1, v1, v5, v3 +; GFX7-NEXT: v_mad_u32_u24 v1, v6, v3, v1 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX7-NEXT: v_mad_u32_u24 v1, v7, v4, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff, v2 @@ -3307,18 +3307,18 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 8, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_bfe_i32 v6, v3, 0, 8 -; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_bfe_i32 v7, v7, 0, 8 ; GFX8-NEXT: v_bfe_i32 v5, v5, 0, 8 -; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 -; GFX8-NEXT: v_and_b32_e32 v7, 0xff, v2 -; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v9, sext(v8) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v2 +; GFX8-NEXT: v_mul_lo_u16_sdwa v6, sext(v3), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX8-NEXT: v_and_b32_e32 v8, 0xff, v8 ; GFX8-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX8-NEXT: v_mad_u16 v6, v6, v7, v8 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX8-NEXT: v_mad_u16 v6, v8, v7, v6 +; GFX8-NEXT: v_bfe_i32 v3, v3, 0, 8 ; GFX8-NEXT: v_mad_u16 v4, v4, v5, v6 ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 @@ -3337,19 +3337,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-NODL-NEXT: s_movk_i32 s0, 0xff ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(1) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-NODL-NEXT: s_waitcnt vmcnt(0) -; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v3, 
16, v1 -; GFX9-NODL-NEXT: v_bfe_i32 v4, v1, 0, 8 -; GFX9-NODL-NEXT: v_and_b32_e32 v5, 0xff, v2 -; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-NODL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NODL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NODL-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX9-NODL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX9-NODL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-NODL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v4, v5, v6 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v8, v3, v4 +; GFX9-NODL-NEXT: v_mad_legacy_u16 v3, v7, v3, v4 ; GFX9-NODL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-NODL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-NODL-NEXT: v_bfe_i32 v1, v1, 0, 16 @@ -3367,19 +3367,19 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_movk_i32 s0, 0xff ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 8, v2 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX9-DL-NEXT: v_bfe_i32 v4, v1, 0, 8 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 0xff, v2 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v7, sext(v6) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-DL-NEXT: v_and_b32_sdwa v8, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v4, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-DL-NEXT: v_bfe_i32 v5, v5, 0, 8 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 0xff, v6 +; GFX9-DL-NEXT: v_and_b32_sdwa v7, v2, s0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 ; GFX9-DL-NEXT: v_bfe_i32 v3, v3, 0, 8 -; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v4, v5, v6 +; GFX9-DL-NEXT: v_mad_legacy_u16 v4, v6, v5, v4 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v8, v3, v4 +; GFX9-DL-NEXT: v_mad_legacy_u16 v3, v7, v3, v4 ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 ; GFX9-DL-NEXT: v_bfe_i32 v1, v1, 0, 16 @@ -3392,28 +3392,28 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0xff +; GFX10-DL-NEXT: v_mov_b32_e32 v6, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 ; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 8, v1 +; GFX10-DL-NEXT: v_bfe_i32 v0, v1, 0, 8 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 8, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v1 -; GFX10-DL-NEXT: v_bfe_i32 v6, v1, 0, 8 -; GFX10-DL-NEXT: v_and_b32_e32 v7, 0xff, v2 -; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 8 -; 
GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v3, 0xff, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 16, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX10-DL-NEXT: v_mul_lo_u16 v0, v3, v0 -; GFX10-DL-NEXT: v_and_b32_sdwa v3, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-DL-NEXT: v_bfe_i32 v4, v5, 0, 8 +; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v3 +; GFX10-DL-NEXT: v_bfe_i32 v3, v4, 0, 8 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 0xff, v5 +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_bfe_i32 v6, v7, 0, 8 ; GFX10-DL-NEXT: v_bfe_i32 v1, v1, 0, 8 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 24, v2 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v3, v4, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v4, v3, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v5, v6, v0 ; GFX10-DL-NEXT: v_mad_u16 v0, v1, v2, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -3429,34 +3429,32 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX11-DL-TRUE16-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-DL-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-DL-TRUE16-NEXT: s_clause 0x1 -; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[0:1] -; GFX11-DL-TRUE16-NEXT: global_load_b32 v4, v0, s[2:3] +; GFX11-DL-TRUE16-NEXT: global_load_b32 v2, v0, s[0:1] +; GFX11-DL-TRUE16-NEXT: global_load_b32 v3, v0, s[2:3] ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v0, 8, v3 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 8, v2 +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v1, v2, 0, 8 ; GFX11-DL-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 8, v4 -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v3, 0, 8 -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v3.h -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v0, 0, 8 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v1.l -; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v4.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l +; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.l, 0xff, v3.l +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v5, 8, v3 +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v6.l, v2.h +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.h, 0xff, v3.h +; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v1.l, v0.l +; GFX11-DL-TRUE16-NEXT: v_and_b16 v0.h, 0xff, v5.l ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v5, v6, 0, 8 -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4) -; GFX11-DL-TRUE16-NEXT: v_mul_lo_u16 v0.l, v0.l, v1.l -; GFX11-DL-TRUE16-NEXT: v_and_b16 v1.l, 0xff, v4.h -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v3.l, v5.l -; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v4 -; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v2.l, v0.h, v0.l -; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v2, v6, 0, 8 -; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v3.l, v0.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v4, 24, v2 +; GFX11-DL-TRUE16-NEXT: v_lshrrev_b32_e32 v3, 24, v3 +; GFX11-DL-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v2.l, v5.l +; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v0.h, v1.l, v0.l +; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v4, v4, 0, 8 +; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.h, v2.l, v0.l ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v2.l -; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v4.l, v0.l +; GFX11-DL-TRUE16-NEXT: v_mov_b16_e32 v1.l, v4.l +; GFX11-DL-TRUE16-NEXT: v_mad_u16 v0.l, v1.l, v3.l, v0.l ; GFX11-DL-TRUE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-DL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-DL-TRUE16-NEXT: v_bfe_i32 v0, v0, 0, 16 @@ -3475,25 +3473,24 @@ define amdgpu_kernel void @idot4_nonstandard_signed(ptr addrspace(1) %src1, ; GFX11-DL-FAKE16-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-DL-FAKE16-NEXT: global_load_b32 v0, v0, s[2:3] ; GFX11-DL-FAKE16-NEXT: s_waitcnt vmcnt(1) -; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 8, v1 +; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v2, v1, 0, 8 ; GFX11-DL-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 8, v0 -; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v1 -; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v6, v1, 0, 8 -; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v2, v2, 0, 8 -; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v3 -; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v7, 0xff, v0 -; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 24, v1 -; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 24, v0 -; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_4) -; GFX11-DL-FAKE16-NEXT: v_mul_lo_u16 v2, v3, v2 +; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v3, 0xff, v0 +; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 8, v1 +; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 8, v0 +; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v1 +; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 +; GFX11-DL-FAKE16-NEXT: v_mul_lo_u16 v2, v2, v3 ; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v3, v4, 0, 8 ; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v4, 0xff, v5 -; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8 -; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v6, v7, v2 -; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 24, v1 +; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v5, v6, 0, 8 +; GFX11-DL-FAKE16-NEXT: v_and_b32_e32 v6, 0xff, v7 +; GFX11-DL-FAKE16-NEXT: v_lshrrev_b32_e32 v0, 24, v0 ; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v4, v3, v2 +; GFX11-DL-FAKE16-NEXT: v_bfe_i32 v1, v1, 0, 8 +; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-DL-FAKE16-NEXT: v_mad_u16 v2, v6, v5, v2 ; GFX11-DL-FAKE16-NEXT: v_mad_u16 v0, v1, v0, v2 ; GFX11-DL-FAKE16-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-DL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index c03802e144d5b..dfc1e3c088129 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -1684,7 +1684,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 ; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4 -; GFX9-NEXT: v_add3_u32 v1, v1, v17, v2 +; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2 
; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -1735,7 +1735,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6 ; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4 -; GFX9-DL-NEXT: v_add3_u32 v1, v1, v17, v2 +; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -1789,7 +1789,7 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v6, v5 ; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v13, v3, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll index 1156f2718cf1e..3259ddf559d77 100644 --- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll +++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll @@ -365,110 +365,107 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 17, v0 -; GFX8-NEXT: v_and_b32_e32 v10, 0xfe000000, v1 +; GFX8-NEXT: v_and_b32_e32 v12, 0xfe000000, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 3 ; GFX8-NEXT: v_lshlrev_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 -; GFX8-NEXT: v_or_b32_e32 v0, v10, v0 +; GFX8-NEXT: v_or_b32_e32 v0, v12, v0 ; GFX8-NEXT: v_mov_b32_e32 v1, s35 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: s_movk_i32 s0, 0x2800 +; GFX8-NEXT: s_movk_i32 s0, 0x5000 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_mov_b32_e32 v6, 0 +; GFX8-NEXT: v_mov_b32_e32 v10, 0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v7, 0 -; GFX8-NEXT: v_mov_b32_e32 v11, 0x7f -; GFX8-NEXT: s_movk_i32 s1, 0x800 -; GFX8-NEXT: s_movk_i32 s2, 0x1000 -; GFX8-NEXT: s_movk_i32 s3, 0x1800 -; GFX8-NEXT: s_movk_i32 s4, 0x2000 +; GFX8-NEXT: v_mov_b32_e32 v11, 0 +; GFX8-NEXT: v_mov_b32_e32 v13, 0x7f ; GFX8-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX8-NEXT: ; =>This Loop Header: Depth=1 ; GFX8-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 -; GFX8-NEXT: s_mov_b32 s5, 0 +; GFX8-NEXT: s_mov_b32 s0, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: .LBB1_2: ; %for.body ; GFX8-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX8-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffd800, v2 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[2:3] -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[8:9] -; GFX8-NEXT: v_add_u32_e32 v12, vcc, 0xffffe000, v2 -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] -; GFX8-NEXT: v_add_u32_e32 v8, vcc, 0xffffe800, v2 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[8:9] -; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xfffff000, v2 -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xfffff800, v2 +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffb000, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[4:5] +; GFX8-NEXT: 
v_add_u32_e32 v6, vcc, 0xffffb800, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[6:7] +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffc000, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[4:5] +; GFX8-NEXT: v_add_u32_e32 v6, vcc, 0xffffc800, v2 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 0xffffd000, v2 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v20, vcc, 0xffffd800, v2 ; GFX8-NEXT: v_addc_u32_e32 v21, vcc, -1, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[16:17] -; GFX8-NEXT: v_add_u32_e32 v16, vcc, s1, v2 -; GFX8-NEXT: v_addc_u32_e32 v17, vcc, 0, v3, vcc -; GFX8-NEXT: s_addk_i32 s5, 0x2000 -; GFX8-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v22, vcc, v14, v6 -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v15, v7, vcc -; GFX8-NEXT: v_add_u32_e32 v6, vcc, s2, v2 -; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[20:21] -; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17] -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, 0, v3, vcc -; GFX8-NEXT: v_add_u32_e32 v20, vcc, s3, v2 -; GFX8-NEXT: v_addc_u32_e32 v21, vcc, 0, v3, vcc -; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v22, vcc, v12, v22 -; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v13, v23, vcc -; GFX8-NEXT: v_add_u32_e32 v12, vcc, s4, v2 ; GFX8-NEXT: flat_load_dwordx2 v[6:7], v[6:7] -; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[20:21] -; GFX8-NEXT: v_addc_u32_e32 v13, vcc, 0, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v22, vcc, 0xffffe000, v2 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[8:9], v[4:5] +; GFX8-NEXT: flat_load_dwordx2 v[4:5], v[20:21] +; GFX8-NEXT: s_addk_i32 s0, 0x2000 +; GFX8-NEXT: s_cmp_gt_u32 s0, 0x3fffff ; GFX8-NEXT: s_waitcnt vmcnt(5) +; GFX8-NEXT: v_add_u32_e32 v24, vcc, v14, v10 +; GFX8-NEXT: v_addc_u32_e32 v25, vcc, v15, v11, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, 0xffffe800, v2 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, -1, v3, vcc +; GFX8-NEXT: v_add_u32_e32 v14, vcc, 0xfffff000, v2 +; GFX8-NEXT: flat_load_dwordx2 v[20:21], v[22:23] +; GFX8-NEXT: flat_load_dwordx2 v[10:11], v[10:11] +; GFX8-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(6) +; GFX8-NEXT: v_add_u32_e32 v22, vcc, v16, v24 +; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v17, v25, vcc +; GFX8-NEXT: v_add_u32_e32 v16, vcc, 0xfffff800, v2 +; GFX8-NEXT: flat_load_dwordx2 v[14:15], v[14:15] +; GFX8-NEXT: v_addc_u32_e32 v17, vcc, -1, v3, vcc +; GFX8-NEXT: flat_load_dwordx2 v[16:17], v[16:17] +; GFX8-NEXT: s_waitcnt vmcnt(7) ; GFX8-NEXT: v_add_u32_e32 v22, vcc, v18, v22 ; GFX8-NEXT: v_addc_u32_e32 v23, vcc, v19, v23, vcc -; GFX8-NEXT: v_add_u32_e32 v18, vcc, s0, v2 -; GFX8-NEXT: flat_load_dwordx2 v[12:13], v[12:13] -; GFX8-NEXT: v_addc_u32_e32 v19, vcc, 0, v3, vcc -; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[18:19] +; GFX8-NEXT: flat_load_dwordx2 v[18:19], v[2:3] ; GFX8-NEXT: v_add_u32_e32 v2, vcc, 0x10000, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; GFX8-NEXT: s_waitcnt vmcnt(7) +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v6, v22 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v7, v23, vcc ; GFX8-NEXT: s_waitcnt vmcnt(6) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v8, v22 -; GFX8-NEXT: v_addc_u32_e32 v9, vcc, v9, v23, vcc +; GFX8-NEXT: v_add_u32_e32 v6, vcc, v8, v6 +; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v9, v7, vcc ; GFX8-NEXT: s_waitcnt vmcnt(5) -; GFX8-NEXT: v_add_u32_e32 v8, vcc, v14, v8 -; GFX8-NEXT: v_addc_u32_e32 v9, 
vcc, v15, v9, vcc -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v8 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v9, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v4, v6 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v5, v7, vcc ; GFX8-NEXT: s_waitcnt vmcnt(4) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v16, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v17, v5, vcc -; GFX8-NEXT: s_waitcnt vmcnt(3) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v6, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc -; GFX8-NEXT: s_waitcnt vmcnt(2) ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v20, v4 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v21, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(3) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v10, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v11, v5, vcc +; GFX8-NEXT: s_waitcnt vmcnt(2) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v14, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v15, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_add_u32_e32 v4, vcc, v12, v4 -; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v4, vcc, v16, v4 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v17, v5, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u32_e32 v6, vcc, v18, v4 -; GFX8-NEXT: v_addc_u32_e32 v7, vcc, v19, v5, vcc +; GFX8-NEXT: v_add_u32_e32 v10, vcc, v18, v4 +; GFX8-NEXT: v_addc_u32_e32 v11, vcc, v19, v5, vcc ; GFX8-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX8-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX8-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; GFX8-NEXT: v_subrev_u32_e32 v11, vcc, 1, v11 +; GFX8-NEXT: v_subrev_u32_e32 v13, vcc, 1, v13 ; GFX8-NEXT: s_and_b64 vcc, exec, vcc ; GFX8-NEXT: s_cbranch_vccz .LBB1_1 ; GFX8-NEXT: ; %bb.4: ; %while.end ; GFX8-NEXT: v_mov_b32_e32 v1, s35 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v10 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s34, v12 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[6:7] +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[10:11] ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: clmem_read: @@ -498,76 +495,79 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX900-NEXT: v_mov_b32_e32 v1, s35 ; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, s34, v0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x2800, v0 +; GFX900-NEXT: v_add_co_u32_e32 v0, vcc, 0x5000, v0 ; GFX900-NEXT: v_mov_b32_e32 v4, 0 ; GFX900-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX900-NEXT: v_mov_b32_e32 v5, 0 ; GFX900-NEXT: v_mov_b32_e32 v7, 0x7f -; GFX900-NEXT: s_movk_i32 s2, 0xf000 -; GFX900-NEXT: s_movk_i32 s3, 0x1000 -; GFX900-NEXT: s_movk_i32 s4, 0x2000 +; GFX900-NEXT: s_movk_i32 s2, 0xd000 +; GFX900-NEXT: s_movk_i32 s3, 0xe000 +; GFX900-NEXT: s_movk_i32 s4, 0xf000 ; GFX900-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX900-NEXT: ; =>This Loop Header: Depth=1 ; GFX900-NEXT: ; Child Loop BB1_2 Depth 2 ; GFX900-NEXT: v_mov_b32_e32 v3, v1 -; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: s_mov_b32 s5, 0 +; GFX900-NEXT: v_mov_b32_e32 v2, v0 ; GFX900-NEXT: .LBB1_2: ; %for.body ; GFX900-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX900-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffe000, v2 +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffb000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[8:9], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off 
offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[20:21], v[14:15], off +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, s2, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v3, vcc +; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, s3, v2 +; GFX900-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 +; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v3, vcc ; GFX900-NEXT: global_load_dwordx2 v[10:11], v[2:3], off offset:-4096 ; GFX900-NEXT: global_load_dwordx2 v[12:13], v[2:3], off offset:-2048 ; GFX900-NEXT: s_addk_i32 s5, 0x2000 ; GFX900-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v14, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v15, v5, vcc -; GFX900-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[8:9], off -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v14, v16 -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v15, v17, vcc -; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, s2, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v3, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[8:9], off offset:-2048 -; GFX900-NEXT: s_waitcnt vmcnt(0) -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v8, v14 -; GFX900-NEXT: v_addc_co_u32_e32 v15, vcc, v9, v15, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[2:3], off offset:2048 -; GFX900-NEXT: v_add_co_u32_e32 v14, vcc, v10, v14 -; GFX900-NEXT: v_addc_co_u32_e32 v11, vcc, v11, v15, vcc -; GFX900-NEXT: v_add_co_u32_e64 v14, s[0:1], v12, v14 -; GFX900-NEXT: v_addc_co_u32_e64 v15, s[0:1], v13, v11, s[0:1] -; GFX900-NEXT: v_add_co_u32_e32 v10, vcc, s3, v2 -; GFX900-NEXT: v_add_co_u32_e64 v12, s[0:1], s4, v2 -; GFX900-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v3, vcc -; GFX900-NEXT: v_addc_co_u32_e64 v13, vcc, 0, v3, s[0:1] -; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v4, v14 -; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v5, v15, vcc -; GFX900-NEXT: global_load_dwordx2 v[4:5], v[12:13], off offset:-4096 -; GFX900-NEXT: global_load_dwordx2 v[14:15], v[10:11], off offset:2048 -; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v8, v16 -; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v9, v17, vcc -; GFX900-NEXT: global_load_dwordx2 v[8:9], v[12:13], off -; GFX900-NEXT: global_load_dwordx2 v[10:11], v[12:13], off offset:2048 +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e32 v22, vcc, v8, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX900-NEXT: global_load_dwordx2 v[8:9], v[14:15], off offset:-4096 +; GFX900-NEXT: s_waitcnt vmcnt(5) +; GFX900-NEXT: v_add_co_u32_e64 v24, s[0:1], v18, v22 +; GFX900-NEXT: v_addc_co_u32_e64 v25, s[0:1], v19, v5, s[0:1] +; GFX900-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 +; GFX900-NEXT: global_load_dwordx2 v[22:23], v[14:15], off +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, s4, v2 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc +; GFX900-NEXT: global_load_dwordx2 v[4:5], v[4:5], off offset:-2048 +; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: v_add_co_u32_e32 v20, vcc, v20, v24 +; GFX900-NEXT: global_load_dwordx2 v[14:15], v[2:3], off +; GFX900-NEXT: v_addc_co_u32_e32 v21, vcc, v21, v25, vcc ; GFX900-NEXT: v_add_co_u32_e32 v2, vcc, 0x10000, v2 ; GFX900-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GFX900-NEXT: s_waitcnt vmcnt(7) +; GFX900-NEXT: v_add_co_u32_e32 v16, vcc, v16, v20 +; GFX900-NEXT: v_addc_co_u32_e32 v17, vcc, v17, v21, vcc +; GFX900-NEXT: s_waitcnt vmcnt(4) +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v8, v16 +; 
GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v9, v17, vcc ; GFX900-NEXT: s_waitcnt vmcnt(3) -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v16 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v17, vcc +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v18, v8 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v19, v9, vcc ; GFX900-NEXT: s_waitcnt vmcnt(2) -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX900-NEXT: v_add_co_u32_e32 v8, vcc, v22, v8 +; GFX900-NEXT: v_addc_co_u32_e32 v9, vcc, v23, v9, vcc ; GFX900-NEXT: s_waitcnt vmcnt(1) -; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 -; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc -; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v4, v8 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v5, v9, vcc ; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 ; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 +; GFX900-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc ; GFX900-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX900-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX900-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -610,7 +610,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6 ; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0 -; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x2800, v0 +; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX10-NEXT: ; =>This Loop Header: Depth=1 @@ -621,30 +621,29 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: .LBB1_2: ; %for.body ; GFX10-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX10-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffe000 +; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffb800 ; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0xfffff000 +; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0xffffc800 ; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, -1, v5, vcc_lo -; GFX10-NEXT: s_clause 0x5 +; GFX10-NEXT: v_add_co_u32 v14, vcc_lo, v4, 0xffffd800 +; GFX10-NEXT: v_add_co_ci_u32_e32 v15, vcc_lo, -1, v5, vcc_lo +; GFX10-NEXT: v_add_co_u32 v18, vcc_lo, v4, 0xffffe800 +; GFX10-NEXT: s_clause 0x2 ; GFX10-NEXT: global_load_dwordx2 v[12:13], v[8:9], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[14:15], v[8:9], off ; GFX10-NEXT: global_load_dwordx2 v[16:17], v[10:11], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[18:19], v[10:11], off -; GFX10-NEXT: global_load_dwordx2 v[20:21], v[4:5], off offset:-2048 -; GFX10-NEXT: global_load_dwordx2 v[22:23], v[4:5], off -; GFX10-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0x1000 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: v_add_co_u32 v10, vcc_lo, v4, 0x2000 -; GFX10-NEXT: v_add_co_ci_u32_e32 v11, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: global_load_dwordx2 v[24:25], v[8:9], off offset:-2048 -; GFX10-NEXT: v_add_co_u32 v26, vcc_lo, 0x2800, v4 -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[28:29], v[10:11], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[20:21], v[14:15], off offset:-2048 +; GFX10-NEXT: v_add_co_ci_u32_e32 v19, vcc_lo, -1, v5, vcc_lo +; 
GFX10-NEXT: v_add_co_u32 v22, vcc_lo, 0xfffff000, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v23, vcc_lo, -1, v5, vcc_lo +; GFX10-NEXT: s_clause 0x7 +; GFX10-NEXT: global_load_dwordx2 v[24:25], v[18:19], off offset:-2048 ; GFX10-NEXT: global_load_dwordx2 v[8:9], v[8:9], off -; GFX10-NEXT: v_add_co_ci_u32_e32 v27, vcc_lo, 0, v5, vcc_lo -; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx2 v[30:31], v[10:11], off -; GFX10-NEXT: global_load_dwordx2 v[32:33], v[26:27], off +; GFX10-NEXT: global_load_dwordx2 v[10:11], v[10:11], off +; GFX10-NEXT: global_load_dwordx2 v[14:15], v[14:15], off +; GFX10-NEXT: global_load_dwordx2 v[26:27], v[18:19], off +; GFX10-NEXT: global_load_dwordx2 v[28:29], v[22:23], off +; GFX10-NEXT: global_load_dwordx2 v[30:31], v[4:5], off offset:-2048 +; GFX10-NEXT: global_load_dwordx2 v[32:33], v[4:5], off ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, 0, v5, vcc_lo ; GFX10-NEXT: s_addk_i32 s1, 0x2000 @@ -652,27 +651,25 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX10-NEXT: s_waitcnt vmcnt(10) ; GFX10-NEXT: v_add_co_u32 v2, s0, v12, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v13, v3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_add_co_u32 v2, s0, v14, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v15, v3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(8) +; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: v_add_co_u32 v2, s0, v8, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v9, v3, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v16, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v17, v3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(7) -; GFX10-NEXT: v_add_co_u32 v2, s0, v18, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v19, v3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(6) +; GFX10-NEXT: s_waitcnt vmcnt(5) +; GFX10-NEXT: v_add_co_u32 v2, s0, v10, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v11, v3, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v20, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v21, v3, s0 -; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_add_co_u32 v2, s0, v22, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v23, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(4) +; GFX10-NEXT: v_add_co_u32 v2, s0, v14, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v15, v3, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v24, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v25, v3, s0 +; GFX10-NEXT: s_waitcnt vmcnt(3) +; GFX10-NEXT: v_add_co_u32 v2, s0, v26, v2 +; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v27, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(2) -; GFX10-NEXT: v_add_co_u32 v2, s0, v8, v2 -; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v9, v3, s0 ; GFX10-NEXT: v_add_co_u32 v2, s0, v28, v2 ; GFX10-NEXT: v_add_co_ci_u32_e64 v3, s0, v29, v3, s0 ; GFX10-NEXT: s_waitcnt vmcnt(1) @@ -720,76 +717,77 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX90A-NEXT: v_mov_b32_e32 v2, s35 ; GFX90A-NEXT: v_add_co_u32_e32 v1, vcc, s34, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v2, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x2800, v1 +; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x5000, v1 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x7f -; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0 +; GFX90A-NEXT: s_movk_i32 s0, 0xd000 +; GFX90A-NEXT: s_movk_i32 s1, 0xe000 ; GFX90A-NEXT: s_movk_i32 s2, 0xf000 -; GFX90A-NEXT: s_movk_i32 s3, 0x1000 -; GFX90A-NEXT: s_movk_i32 s4, 0x2000 ; GFX90A-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 ; GFX90A-NEXT: ; Child Loop BB1_2 Depth 
2 +; GFX90A-NEXT: s_mov_b32 s3, 0 ; GFX90A-NEXT: v_pk_mov_b32 v[6:7], v[2:3], v[2:3] op_sel:[0,1] -; GFX90A-NEXT: s_mov_b32 s5, 0 ; GFX90A-NEXT: .LBB1_2: ; %for.body ; GFX90A-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 -; GFX90A-NEXT: v_add_co_u32_e64 v18, s[0:1], s3, v6 -; GFX90A-NEXT: v_addc_co_u32_e64 v19, s[0:1], 0, v7, s[0:1] -; GFX90A-NEXT: v_add_co_u32_e64 v20, s[0:1], s4, v6 -; GFX90A-NEXT: v_add_co_u32_e32 v8, vcc, 0xffffe000, v6 -; GFX90A-NEXT: v_addc_co_u32_e64 v21, s[0:1], 0, v7, s[0:1] -; GFX90A-NEXT: v_addc_co_u32_e32 v9, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[20:21], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[20:21], off -; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[8:9], off offset:-2048 -; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[8:9], off +; GFX90A-NEXT: v_add_co_u32_e32 v12, vcc, 0xffffb000, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v13, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[12:13], off +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, 0xffffc000, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[14:15], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[14:15], off +; GFX90A-NEXT: v_add_co_u32_e32 v16, vcc, s0, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v17, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[16:17], off offset:-2048 +; GFX90A-NEXT: v_add_co_u32_e32 v14, vcc, s1, v6 +; GFX90A-NEXT: v_addc_co_u32_e32 v15, vcc, -1, v7, vcc +; GFX90A-NEXT: global_load_dwordx2 v[24:25], v[14:15], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[26:27], v[14:15], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[28:29], v[14:15], off ; GFX90A-NEXT: v_add_co_u32_e32 v22, vcc, s2, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v7, vcc -; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[22:23], off offset:-2048 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_load_dwordx2 v[18:19], v[18:19], off offset:2048 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_load_dwordx2 v[20:21], v[20:21], off offset:2048 -; GFX90A-NEXT: s_nop 0 -; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-4096 -; GFX90A-NEXT: global_load_dwordx2 v[12:13], v[6:7], off offset:-2048 -; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[6:7], off -; GFX90A-NEXT: global_load_dwordx2 v[16:17], v[6:7], off offset:2048 +; GFX90A-NEXT: global_load_dwordx2 v[14:15], v[22:23], off offset:-2048 +; GFX90A-NEXT: global_load_dwordx2 v[30:31], v[6:7], off +; GFX90A-NEXT: global_load_dwordx2 v[8:9], v[6:7], off offset:-4096 +; GFX90A-NEXT: global_load_dwordx2 v[10:11], v[6:7], off offset:-2048 ; GFX90A-NEXT: v_add_co_u32_e32 v6, vcc, 0x10000, v6 ; GFX90A-NEXT: v_addc_co_u32_e32 v7, vcc, 0, v7, vcc -; GFX90A-NEXT: s_addk_i32 s5, 0x2000 -; GFX90A-NEXT: s_cmp_gt_u32 s5, 0x3fffff -; GFX90A-NEXT: s_waitcnt vmcnt(8) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(7) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(6) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(3) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(2) +; GFX90A-NEXT: s_addk_i32 s3, 0x2000 +; GFX90A-NEXT: 
s_cmp_gt_u32 s3, 0x3fffff +; GFX90A-NEXT: s_waitcnt vmcnt(10) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v12, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v13, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(1) -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc -; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: s_waitcnt vmcnt(9) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(8) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(7) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v16, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v17, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(6) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v24, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v25, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v18, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v19, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(5) ; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v26, v4 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v27, v5, vcc -; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v20, v4 -; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v21, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(4) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v28, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v29, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(3) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v14, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v15, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(1) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v8, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v9, v5, vcc +; GFX90A-NEXT: s_waitcnt vmcnt(0) +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v10, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v11, v5, vcc +; GFX90A-NEXT: v_add_co_u32_e32 v4, vcc, v30, v4 +; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, v31, v5, vcc ; GFX90A-NEXT: s_cbranch_scc0 .LBB1_2 ; GFX90A-NEXT: ; %bb.3: ; %while.cond.loopexit ; GFX90A-NEXT: ; in Loop: Header=BB1_1 Depth=1 @@ -824,7 +822,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x2800, v0 +; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0 ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo ; GFX11-NEXT: .LBB1_1: ; %for.cond.preheader ; GFX11-NEXT: ; =>This Loop Header: Depth=1 @@ -836,74 +834,76 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) { ; GFX11-NEXT: ; Parent Loop BB1_1 Depth=1 ; GFX11-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, 0xffffe000, v4 +; GFX11-NEXT: v_add_co_u32 v8, vcc_lo, v4, 0xffffc000 ; GFX11-NEXT: v_add_co_ci_u32_e64 v9, null, -1, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0xfffff000, v4 +; GFX11-NEXT: v_add_co_u32 v10, vcc_lo, 0xffffc000, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v11, null, -1, v5, vcc_lo -; GFX11-NEXT: global_load_b64 v[12:13], v[8:9], off offset:-2048 -; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, v4, 0x2000 -; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, 0, v5, vcc_lo -; GFX11-NEXT: v_add_co_u32 v24, vcc_lo, 0x1000, v4 -; GFX11-NEXT: s_delay_alu 
instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_co_ci_u32_e64 v25, null, 0, v5, vcc_lo -; GFX11-NEXT: global_load_b64 v[26:27], v[22:23], off offset:-4096 -; GFX11-NEXT: v_add_co_u32 v28, vcc_lo, 0x2000, v4 -; GFX11-NEXT: s_clause 0x6 -; GFX11-NEXT: global_load_b64 v[24:25], v[24:25], off offset:2048 -; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off +; GFX11-NEXT: global_load_b64 v[14:15], v[8:9], off offset:-4096 +; GFX11-NEXT: v_add_co_u32 v12, vcc_lo, 0xffffd000, v4 +; GFX11-NEXT: v_add_co_ci_u32_e64 v13, null, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v16, vcc_lo, v4, 0xffffe000 ; GFX11-NEXT: global_load_b64 v[10:11], v[10:11], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[14:15], v[4:5], off offset:-4096 -; GFX11-NEXT: global_load_b64 v[16:17], v[4:5], off offset:-2048 -; GFX11-NEXT: global_load_b64 v[18:19], v[4:5], off -; GFX11-NEXT: global_load_b64 v[20:21], v[4:5], off offset:2048 -; GFX11-NEXT: v_add_co_ci_u32_e64 v29, null, 0, v5, vcc_lo +; GFX11-NEXT: v_add_co_ci_u32_e64 v17, null, -1, v5, vcc_lo +; GFX11-NEXT: global_load_b64 v[12:13], v[12:13], off offset:-2048 +; GFX11-NEXT: v_add_co_u32 v18, vcc_lo, 0xffffe000, v4 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[22:23], v[22:23], off -; GFX11-NEXT: global_load_b64 v[28:29], v[28:29], off offset:2048 +; GFX11-NEXT: global_load_b64 v[20:21], v[16:17], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[8:9], v[8:9], off +; GFX11-NEXT: v_add_co_ci_u32_e64 v19, null, -1, v5, vcc_lo +; GFX11-NEXT: v_add_co_u32 v22, vcc_lo, 0xfffff000, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_add_co_ci_u32_e64 v23, null, -1, v5, vcc_lo +; GFX11-NEXT: s_clause 0x5 +; GFX11-NEXT: global_load_b64 v[18:19], v[18:19], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[16:17], v[16:17], off +; GFX11-NEXT: global_load_b64 v[22:23], v[22:23], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[24:25], v[4:5], off offset:-4096 +; GFX11-NEXT: global_load_b64 v[26:27], v[4:5], off offset:-2048 +; GFX11-NEXT: global_load_b64 v[28:29], v[4:5], off ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, 0x10000, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, 0, v5, vcc_lo ; GFX11-NEXT: s_addk_i32 s1, 0x2000 ; GFX11-NEXT: s_cmp_gt_u32 s1, 0x3fffff ; GFX11-NEXT: s_waitcnt vmcnt(10) -; GFX11-NEXT: v_add_co_u32 v2, s0, v12, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, s0 -; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_add_co_u32 v2, s0, v8, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, s0 -; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, s0 +; GFX11-NEXT: s_waitcnt vmcnt(9) ; GFX11-NEXT: v_add_co_u32 v2, s0, v10, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v11, v3, s0 +; GFX11-NEXT: s_waitcnt vmcnt(6) +; GFX11-NEXT: v_add_co_u32 v2, s0, v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v9, v3, s0 +; GFX11-NEXT: v_add_co_u32 v2, s0, v12, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v13, v3, s0 +; GFX11-NEXT: v_add_co_u32 v2, 
s0, v20, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v21, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(5) -; GFX11-NEXT: v_add_co_u32 v2, s0, v14, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v18, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v15, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v19, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_add_co_u32 v2, s0, v16, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v17, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_add_co_u32 v2, s0, v18, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v22, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v19, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v23, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_add_co_u32 v2, s0, v20, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v21, v3, s0 -; GFX11-NEXT: v_add_co_u32 v2, s0, v26, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v27, v3, s0 ; GFX11-NEXT: v_add_co_u32 v2, s0, v24, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v25, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_add_co_u32 v2, s0, v22, v2 +; GFX11-NEXT: v_add_co_u32 v2, s0, v26, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v23, v3, s0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, v27, v3, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v28, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll index c0c1763d54cc0..d6d639c5e496b 100644 --- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -146,11 +146,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_4]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.276, align 8, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.159, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] @@ -169,11 +169,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.285, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.207, addrspace 4) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4) + ; CHECK-NEXT: 
[[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4) ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4) ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.223, addrspace 4) @@ -190,22 +190,26 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.296, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4) ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]] ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0 ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]] ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32)) - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.259, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4) + ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1 + ; CHECK-NEXT: KILL undef %469:sreg_64 + ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3 ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.268, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4) ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.308, align 8, addrspace 4) - ; 
CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM22]] ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 %73.sub0, 65535, implicit-def dead $scc @@ -221,17 +225,17 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.326, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4) ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.332, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4) ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.338, addrspace 4) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN 
[[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) - ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]] + ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8) + ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]] ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]] diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll index f6922c75ff848..30f52773bbce5 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll @@ -7,27 +7,36 @@ define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) { ; GFX8-LABEL: barrier_vmcnt_global: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_barrier -; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: flat_store_dword v[0:1], v4 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: barrier_vmcnt_global: ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v3, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -48,20 +57,22 @@ bb: define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) { ; GFX8-LABEL: barrier_vscnt_global: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1] +; GFX8-NEXT: 
v_add_u32_e32 v2, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc -; GFX8-NEXT: flat_store_dword v[1:2], v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_barrier -; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: barrier_vscnt_global: @@ -70,14 +81,18 @@ define amdgpu_kernel void @barrier_vscnt_global(ptr addrspace(1) %arg) { ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: global_store_dword v[2:3], v0, off offset:-4 +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -100,19 +115,22 @@ bb: define amdgpu_kernel void @barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) { ; GFX8-LABEL: barrier_vmcnt_vscnt_global: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, -8, v1 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v2, vcc -; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: flat_store_dword v[1:2], v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v3 @@ -124,15 +142,19 @@ define amdgpu_kernel void 
@barrier_vmcnt_vscnt_global(ptr addrspace(1) %arg) { ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX9-NEXT: global_load_dword v0, v[2:3], off offset:-8 -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-NEXT: global_store_dword v[2:3], v1, off +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: global_load_dword v3, v2, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: global_store_dword v[2:3], v0, off offset:-4 +; GFX9-NEXT: global_store_dword v[0:1], v3, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -157,30 +179,38 @@ bb: define amdgpu_kernel void @barrier_vmcnt_flat(ptr %arg) { ; GFX8-LABEL: barrier_vmcnt_flat: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_barrier -; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: flat_store_dword v[0:1], v4 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: barrier_vmcnt_flat: ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc +; GFX9-NEXT: flat_load_dword v4, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_load_dword v2, v[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v2 offset:4 +; GFX9-NEXT: flat_store_dword v[0:1], v4 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -201,20 +231,22 @@ bb: define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) { ; GFX8-LABEL: barrier_vscnt_flat: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v1, 
vcc, s0, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc -; GFX8-NEXT: flat_store_dword v[1:2], v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 1 +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v3, 1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_barrier -; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: barrier_vscnt_flat: @@ -223,16 +255,18 @@ define amdgpu_kernel void @barrier_vscnt_flat(ptr %arg) { ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-NEXT: flat_store_dword v[2:3], v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 1 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v3, 1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v2 +; GFX9-NEXT: flat_store_dword v[0:1], v3 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -255,19 +289,22 @@ bb: define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) { ; GFX8-LABEL: barrier_vmcnt_vscnt_flat: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, -8, v1 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v2, vcc -; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: flat_store_dword v[1:2], v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v3 @@ -279,18 +316,21 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat(ptr %arg) { ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX9-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, -8, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc -; GFX9-NEXT: flat_load_dword v4, v[4:5] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-NEXT: flat_store_dword v[2:3], v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dword v3, v[2:3] +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v4 +; GFX9-NEXT: flat_store_dword v[0:1], v3 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -315,19 +355,22 @@ bb: define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { ; GFX8-LABEL: barrier_vmcnt_vscnt_flat_workgroup: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, 2, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_lshrrev_b64 v[1:2], 30, v[0:1] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_addc_u32_e32 v2, vcc, v3, v2, vcc -; GFX8-NEXT: v_add_u32_e32 v3, vcc, -8, v1 -; GFX8-NEXT: v_addc_u32_e32 v4, vcc, -1, v2, vcc -; GFX8-NEXT: flat_load_dword v3, v[3:4] -; GFX8-NEXT: flat_store_dword v[1:2], v0 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, -4, v1 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, -1, v2, vcc +; GFX8-NEXT: v_mov_b32_e32 v4, s1 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v4, v3, vcc +; GFX8-NEXT: flat_store_dword v[2:3], v1 +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX8-NEXT: flat_load_dword v3, v[2:3] +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v4, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_barrier ; GFX8-NEXT: flat_store_dword v[0:1], v3 @@ -339,18 +382,21 @@ define amdgpu_kernel void @barrier_vmcnt_vscnt_flat_workgroup(ptr %arg) { ; GFX9-NEXT: v_add_u32_e32 v2, 2, v0 ; GFX9-NEXT: v_lshrrev_b64 v[2:3], 30, v[1:2] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v0, v3, vcc -; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, -8, v2 -; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, -1, v3, vcc -; GFX9-NEXT: flat_load_dword v4, v[4:5] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, -4, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v3, vcc ; GFX9-NEXT: flat_store_dword v[2:3], v1 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v3, vcc +; GFX9-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v2 +; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v4, vcc +; GFX9-NEXT: flat_load_dword v3, v[2:3] +; GFX9-NEXT: v_add_u32_e32 v2, 
1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v4, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_barrier -; GFX9-NEXT: flat_store_dword v[0:1], v4 +; GFX9-NEXT: flat_store_dword v[0:1], v3 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -375,25 +421,34 @@ bb: define amdgpu_kernel void @load_vmcnt_global(ptr addrspace(1) %arg) { ; GFX8-LABEL: load_vmcnt_global: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: flat_store_dword v[0:1], v4 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: load_vmcnt_global: ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] +; GFX9-NEXT: global_load_dword v2, v1, s[0:1] +; GFX9-NEXT: v_add_u32_e32 v1, 1, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] offset:4 +; GFX9-NEXT: global_store_dword v[0:1], v2, off ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -411,28 +466,36 @@ bb: define amdgpu_kernel void @load_vmcnt_flat(ptr %arg) { ; GFX8-LABEL: load_vmcnt_flat: ; GFX8: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX8-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX8-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s0, v1 +; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX8-NEXT: flat_load_dword v4, v[1:2] +; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, 1, v0 +; GFX8-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_dword v2, v[0:1] -; GFX8-NEXT: v_add_u32_e32 v0, vcc, 4, v0 -; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: flat_store_dword v[0:1], v2 +; GFX8-NEXT: flat_store_dword v[0:1], v4 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: load_vmcnt_flat: ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 +; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, s0, v1 +; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, 
v3, vcc +; GFX9-NEXT: flat_load_dword v4, v[1:2] +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_add_u32_e32 v2, 1, v0 +; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GFX9-NEXT: flat_load_dword v2, v[0:1] +; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: flat_store_dword v[0:1], v2 offset:4 +; GFX9-NEXT: flat_store_dword v[0:1], v4 ; GFX9-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll index 51b42463d81ba..0d3846d1eac16 100644 --- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/pr23975.ll @@ -13,7 +13,7 @@ entry: %tmp = sext i32 undef to i64 %arrayidx114 = getelementptr inbounds %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp %tmp1 = getelementptr %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp, i32 0, i64 0, i64 1 -; CHECK: %tmp1 = getelementptr i8, ptr addrspace(1) %arrayidx114, i64 4 +; CHECK: %tmp1 = getelementptr %struct.Matrix4x4, ptr addrspace(1) %leafTransformations, i64 %tmp, i32 0, i64 0, i64 1 %tmp2 = load <4 x float>, ptr addrspace(1) undef, align 4 ret void } diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll index 8662e61c95d66..03edfdceab325 100644 --- a/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll @@ -46,9 +46,9 @@ define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_off ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARR]], i64 [[TMP]] ; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(1) [[P1]], align 4 ; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[OFFSET:%.*]] = sext i32 [[I]] to i64 -; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[OFFSET]], 2 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[P1]], i64 [[TMP5]] +; CHECK-NEXT: [[J2:%.*]] = add i32 [[J1]], [[I]] +; CHECK-NEXT: [[TMP5:%.*]] = sext i32 [[J2]] to i64 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr addrspace(1) [[ARR]], i64 [[TMP5]] ; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(1) [[P2]], align 4 ; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void @@ -109,8 +109,8 @@ define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(pt ; CHECK-NEXT: [[P1:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[ARR]], i32 [[J1]] ; CHECK-NEXT: [[V11:%.*]] = load i32, ptr addrspace(3) [[P1]], align 4 ; CHECK-NEXT: store i32 [[V11]], ptr addrspace(1) [[OUT]], align 4 -; CHECK-NEXT: [[J2:%.*]] = shl i32 [[I]], 2 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[P1]], i32 [[J2]] +; CHECK-NEXT: [[J2:%.*]] = add i32 [[J1]], [[I]] +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds float, ptr addrspace(3) [[ARR]], i32 [[J2]] ; CHECK-NEXT: [[V22:%.*]] = load i32, ptr addrspace(3) [[P2]], align 4 ; CHECK-NEXT: store i32 [[V22]], ptr addrspace(1) [[OUT]], align 4 ; CHECK-NEXT: ret void diff 
--git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll deleted file mode 100644 index b28ca07084495..0000000000000 --- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-i8-gep.ll +++ /dev/null @@ -1,271 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 -; RUN: opt < %s -passes=slsr -S | FileCheck %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX - -target triple = "nvptx64-nvidia-cuda" - -define void @slsr_i8_zero_delta(ptr %in, ptr %out, i64 %add) { -; PTX-LABEL: slsr_i8_zero_delta( -; PTX: { -; PTX-NEXT: .reg .b16 %rs<6>; -; PTX-NEXT: .reg .b64 %rd<5>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_zero_delta_param_0]; -; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_zero_delta_param_2]; -; PTX-NEXT: add.s64 %rd3, %rd1, %rd2; -; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_zero_delta_param_1]; -; PTX-NEXT: ld.b8 %rs1, [%rd3+32]; -; PTX-NEXT: ld.b8 %rs2, [%rd3+64]; -; PTX-NEXT: ld.b8 %rs3, [%rd3+96]; -; PTX-NEXT: add.s16 %rs4, %rs1, %rs2; -; PTX-NEXT: add.s16 %rs5, %rs4, %rs3; -; PTX-NEXT: st.b8 [%rd4], %rs5; -; PTX-NEXT: ret; -; CHECK-LABEL: define void @slsr_i8_zero_delta( -; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) { -; CHECK-NEXT: [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]] -; CHECK-NEXT: [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32 -; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1 -; CHECK-NEXT: [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 64 -; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1 -; CHECK-NEXT: [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 96 -; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1 -; CHECK-NEXT: [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]] -; CHECK-NEXT: [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]] -; CHECK-NEXT: store i8 [[OUT1]], ptr [[OUT]], align 1 -; CHECK-NEXT: ret void -; - %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add - %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32 - %load0 = load i8, ptr %getElem0.1 - - %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add - %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64 - %load1 = load i8, ptr %getElem1.1 - - %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add - %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96 - %load2 = load i8, ptr %getElem2.1 - - %out0 = add i8 %load0, %load1 - %out1 = add i8 %out0, %load2 - store i8 %out1, ptr %out - - ret void -} - -define void @slsr_i8_zero_delta_2(ptr %in, ptr %out, i64 %add) { -; PTX-LABEL: slsr_i8_zero_delta_2( -; PTX: { -; PTX-NEXT: .reg .b16 %rs<6>; -; PTX-NEXT: .reg .b64 %rd<5>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_zero_delta_2_param_0]; -; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_zero_delta_2_param_2]; -; PTX-NEXT: add.s64 %rd3, %rd1, %rd2; -; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_zero_delta_2_param_1]; -; PTX-NEXT: ld.b8 %rs1, [%rd3]; -; PTX-NEXT: ld.b8 %rs2, [%rd3+32]; -; PTX-NEXT: ld.b8 %rs3, [%rd3+64]; -; PTX-NEXT: add.s16 %rs4, %rs1, %rs2; -; PTX-NEXT: add.s16 %rs5, %rs4, %rs3; -; PTX-NEXT: st.b8 [%rd4], %rs5; -; PTX-NEXT: ret; -; CHECK-LABEL: define void @slsr_i8_zero_delta_2( -; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) { -; CHECK-NEXT: 
[[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]] -; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_0]], align 1 -; CHECK-NEXT: [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32 -; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1 -; CHECK-NEXT: [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 64 -; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1 -; CHECK-NEXT: [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]] -; CHECK-NEXT: [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]] -; CHECK-NEXT: store i8 [[OUT1]], ptr [[OUT]], align 1 -; CHECK-NEXT: ret void -; - %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add - %load0 = load i8, ptr %getElem0.0 - - %getElem1.0 = getelementptr i8, ptr %in, i64 %add - %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 32 - %load1 = load i8, ptr %getElem1.1 - - %getElem2.0 = getelementptr i8, ptr %in, i64 %add - %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 64 - %load2 = load i8, ptr %getElem2.1 - - %out0 = add i8 %load0, %load1 - %out1 = add i8 %out0, %load2 - store i8 %out1, ptr %out - - ret void -} - -define void @slsr_i8_base_delta(ptr %in, ptr %out, i64 %add) { -; PTX-LABEL: slsr_i8_base_delta( -; PTX: { -; PTX-NEXT: .reg .b16 %rs<6>; -; PTX-NEXT: .reg .b64 %rd<5>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_base_delta_param_0]; -; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_base_delta_param_2]; -; PTX-NEXT: add.s64 %rd3, %rd1, %rd2; -; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_base_delta_param_1]; -; PTX-NEXT: ld.b8 %rs1, [%rd3+32]; -; PTX-NEXT: ld.b8 %rs2, [%rd3+65]; -; PTX-NEXT: ld.b8 %rs3, [%rd3+98]; -; PTX-NEXT: add.s16 %rs4, %rs1, %rs2; -; PTX-NEXT: add.s16 %rs5, %rs4, %rs3; -; PTX-NEXT: st.b8 [%rd4], %rs5; -; PTX-NEXT: ret; -; CHECK-LABEL: define void @slsr_i8_base_delta( -; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) { -; CHECK-NEXT: [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]] -; CHECK-NEXT: [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32 -; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1 -; CHECK-NEXT: [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 1 -; CHECK-NEXT: [[GETELEM1_2:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_1]], i64 64 -; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_2]], align 1 -; CHECK-NEXT: [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 2 -; CHECK-NEXT: [[GETELEM2_2:%.*]] = getelementptr inbounds i8, ptr [[GETELEM2_1]], i64 96 -; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_2]], align 1 -; CHECK-NEXT: [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]] -; CHECK-NEXT: [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]] -; CHECK-NEXT: store i8 [[OUT1]], ptr [[OUT]], align 1 -; CHECK-NEXT: ret void -; - %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add - %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32 - %load0 = load i8, ptr %getElem0.1 - - %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 1 - %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 %add - %getElem1.2 = getelementptr inbounds i8, ptr %getElem1.1, i64 64 - %load1 = load i8, ptr %getElem1.2 - - %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 2 - %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 %add - %getElem2.2 = getelementptr inbounds i8, ptr %getElem2.1, i64 96 - %load2 = load i8, ptr %getElem2.2 - - %out0 = add i8 %load0, 
%load1 - %out1 = add i8 %out0, %load2 - store i8 %out1, ptr %out - - ret void -} - -define void @slsr_i8_index_delta(ptr %in, ptr %out, i64 %add) { -; PTX-LABEL: slsr_i8_index_delta( -; PTX: { -; PTX-NEXT: .reg .b16 %rs<6>; -; PTX-NEXT: .reg .b64 %rd<7>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_index_delta_param_0]; -; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_index_delta_param_2]; -; PTX-NEXT: shl.b64 %rd3, %rd2, 3; -; PTX-NEXT: add.s64 %rd4, %rd1, %rd3; -; PTX-NEXT: ld.param.b64 %rd5, [slsr_i8_index_delta_param_1]; -; PTX-NEXT: ld.b8 %rs1, [%rd4+32]; -; PTX-NEXT: add.s64 %rd6, %rd1, %rd2; -; PTX-NEXT: ld.b8 %rs2, [%rd6+64]; -; PTX-NEXT: ld.b8 %rs3, [%rd6+96]; -; PTX-NEXT: add.s16 %rs4, %rs1, %rs2; -; PTX-NEXT: add.s16 %rs5, %rs4, %rs3; -; PTX-NEXT: st.b8 [%rd5], %rs5; -; PTX-NEXT: ret; -; CHECK-LABEL: define void @slsr_i8_index_delta( -; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]]) { -; CHECK-NEXT: [[GETELEM0_0:%.*]] = getelementptr inbounds double, ptr [[IN]], i64 [[ADD]] -; CHECK-NEXT: [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32 -; CHECK-NEXT: [[LOAD0:%.*]] = load i8, ptr [[GETELEM0_1]], align 1 -; CHECK-NEXT: [[GETELEM1_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]] -; CHECK-NEXT: [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 64 -; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1 -; CHECK-NEXT: [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 96 -; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1 -; CHECK-NEXT: [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]] -; CHECK-NEXT: [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]] -; CHECK-NEXT: store i8 [[OUT1]], ptr [[OUT]], align 1 -; CHECK-NEXT: ret void -; - %getElem0.0 = getelementptr inbounds double, ptr %in, i64 %add - %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32 - %load0 = load i8, ptr %getElem0.1 - - %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add - %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64 - %load1 = load i8, ptr %getElem1.1 - - %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add - %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96 - %load2 = load i8, ptr %getElem2.1 - - %out0 = add i8 %load0, %load1 - %out1 = add i8 %out0, %load2 - store i8 %out1, ptr %out - - ret void -} - -define void @slsr_i8_stride_delta(ptr %in, ptr %out, i64 %add, i64 %offset) { -; PTX-LABEL: slsr_i8_stride_delta( -; PTX: { -; PTX-NEXT: .reg .b16 %rs<6>; -; PTX-NEXT: .reg .b64 %rd<7>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.b64 %rd1, [slsr_i8_stride_delta_param_0]; -; PTX-NEXT: ld.param.b64 %rd2, [slsr_i8_stride_delta_param_2]; -; PTX-NEXT: add.s64 %rd3, %rd1, %rd2; -; PTX-NEXT: ld.param.b64 %rd4, [slsr_i8_stride_delta_param_1]; -; PTX-NEXT: ld.b8 %rs1, [%rd3+32]; -; PTX-NEXT: ld.param.b64 %rd5, [slsr_i8_stride_delta_param_3]; -; PTX-NEXT: ld.b8 %rs2, [%rd3+65]; -; PTX-NEXT: add.s64 %rd6, %rd3, %rd5; -; PTX-NEXT: ld.b8 %rs3, [%rd6+96]; -; PTX-NEXT: add.s16 %rs4, %rs1, %rs2; -; PTX-NEXT: add.s16 %rs5, %rs4, %rs3; -; PTX-NEXT: st.b8 [%rd4], %rs5; -; PTX-NEXT: ret; -; CHECK-LABEL: define void @slsr_i8_stride_delta( -; CHECK-SAME: ptr [[IN:%.*]], ptr [[OUT:%.*]], i64 [[ADD:%.*]], i64 [[OFFSET:%.*]]) { -; CHECK-NEXT: [[GETELEM0_0:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 [[ADD]] -; CHECK-NEXT: [[GETELEM0_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 32 -; CHECK-NEXT: [[LOAD0:%.*]] = load 
i8, ptr [[GETELEM0_1]], align 1 -; CHECK-NEXT: [[GETELEM1_0:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 1 -; CHECK-NEXT: [[GETELEM1_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1_0]], i64 64 -; CHECK-NEXT: [[LOAD1:%.*]] = load i8, ptr [[GETELEM1_1]], align 1 -; CHECK-NEXT: [[GETELEM2_0:%.*]] = getelementptr inbounds i8, ptr [[GETELEM0_0]], i64 [[OFFSET]] -; CHECK-NEXT: [[GETELEM2_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM2_0]], i64 96 -; CHECK-NEXT: [[LOAD2:%.*]] = load i8, ptr [[GETELEM2_1]], align 1 -; CHECK-NEXT: [[OUT0:%.*]] = add i8 [[LOAD0]], [[LOAD1]] -; CHECK-NEXT: [[OUT1:%.*]] = add i8 [[OUT0]], [[LOAD2]] -; CHECK-NEXT: store i8 [[OUT1]], ptr [[OUT]], align 1 -; CHECK-NEXT: ret void -; - %getElem0.0 = getelementptr inbounds i8, ptr %in, i64 %add - %getElem0.1 = getelementptr inbounds i8, ptr %getElem0.0, i64 32 - %load0 = load i8, ptr %getElem0.1 - - %add1 = add i64 %add, 1 - %getElem1.0 = getelementptr inbounds i8, ptr %in, i64 %add1 - %getElem1.1 = getelementptr inbounds i8, ptr %getElem1.0, i64 64 - %load1 = load i8, ptr %getElem1.1 - - %add2 = add i64 %add, %offset - %getElem2.0 = getelementptr inbounds i8, ptr %in, i64 %add2 - %getElem2.1 = getelementptr inbounds i8, ptr %getElem2.0, i64 96 - %load2 = load i8, ptr %getElem2.1 - - %out0 = add i8 %load0, %load1 - %out1 = add i8 %out0, %load2 - store i8 %out1, ptr %out - - ret void -} diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll b/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll deleted file mode 100644 index ce6f720cc4352..0000000000000 --- a/llvm/test/Transforms/StraightLineStrengthReduce/NVPTX/slsr-var-delta.ll +++ /dev/null @@ -1,70 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: opt < %s -passes=slsr -S | FileCheck %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_75 | FileCheck %s --check-prefix=PTX - -target triple = "nvptx64-nvidia-cuda" - -; Test SLSR can reuse the computation by complex variable delta. -; The original program needs 4 mul.wide.s32, after SLSR with -; variable-delta, it can reduce to 1 mul.wide.s32. 
-define void @foo(ptr %a, ptr %b, i32 %j) { -; PTX-LABEL: foo( -; PTX: { -; PTX-NEXT: .reg .b32 %r<4>; -; PTX-NEXT: .reg .b64 %rd<9>; -; PTX-EMPTY: -; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.b64 %rd1, [foo_param_0]; -; PTX-NEXT: ld.b32 %r1, [%rd1]; -; PTX-NEXT: ld.param.b64 %rd2, [foo_param_1]; -; PTX-NEXT: ld.param.b32 %r2, [foo_param_2]; -; PTX-NEXT: add.s32 %r3, %r1, %r2; -; PTX-NEXT: mul.wide.s32 %rd3, %r3, 4; -; PTX-NEXT: add.s64 %rd4, %rd2, %rd3; -; PTX-NEXT: st.b32 [%rd4], 0; -; PTX-NEXT: add.s64 %rd5, %rd4, %rd3; -; PTX-NEXT: st.b32 [%rd5], 1; -; PTX-NEXT: add.s64 %rd6, %rd5, 4; -; PTX-NEXT: st.b32 [%rd5+4], 2; -; PTX-NEXT: add.s64 %rd7, %rd6, %rd3; -; PTX-NEXT: st.b32 [%rd7], 3; -; PTX-NEXT: add.s64 %rd8, %rd7, %rd3; -; PTX-NEXT: st.b32 [%rd8], 4; -; PTX-NEXT: ret; - %i.0 = load i32, ptr %a, align 8 - %i = add i32 %i.0, %j - ; CHECK: [[L:%.*]] = load i32, ptr %a, align 8 - ; CHECK: [[I:%.*]] = add i32 [[L]], %j - %gep.24 = getelementptr float, ptr %b, i32 %i - ; CHECK: [[GEP0:%.*]] = getelementptr float, ptr %b, i32 [[I]] - ; CHECK: store i32 0, ptr [[GEP0]] - store i32 0, ptr %gep.24 - %gep.24.sum1 = add i32 %i, %i - %gep.25 = getelementptr float, ptr %b, i32 %gep.24.sum1 - ; CHECK: [[EXT1:%.*]] = sext i32 [[I]] to i64 - ; CHECK: [[MUL1:%.*]] = shl i64 [[EXT1]], 2 - ; CHECK: [[GEP1:%.*]] = getelementptr i8, ptr [[GEP0]], i64 [[MUL1]] - ; CHECK: store i32 1, ptr [[GEP1]] - store i32 1, ptr %gep.25 - %gep.26.sum3 = add i32 1, %i - %gep.27.sum = add i32 %gep.26.sum3, %i - %gep.28 = getelementptr float, ptr %b, i32 %gep.27.sum - ; CHECK: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 4 - ; CHECK: store i32 2, ptr [[GEP2]] - store i32 2, ptr %gep.28 - %gep.28.sum = add i32 %gep.27.sum, %i - %gep.29 = getelementptr float, ptr %b, i32 %gep.28.sum - ; CHECK: [[EXT2:%.*]] = sext i32 [[I]] to i64 - ; CHECK: [[MUL2:%.*]] = shl i64 [[EXT2]], 2 - ; CHECK: [[GEP3:%.*]] = getelementptr i8, ptr [[GEP2]], i64 [[MUL2]] - ; CHECK: store i32 3, ptr [[GEP3]] - store i32 3, ptr %gep.29 - %gep.29.sum = add i32 %gep.28.sum, %i - %gep.30 = getelementptr float, ptr %b, i32 %gep.29.sum - ; CHECK: [[EXT3:%.*]] = sext i32 [[I]] to i64 - ; CHECK: [[MUL3:%.*]] = shl i64 [[EXT3]], 2 - ; CHECK: [[GEP4:%.*]] = getelementptr i8, ptr [[GEP3]], i64 [[MUL3]] - ; CHECK: store i32 4, ptr [[GEP4]] - store i32 4, ptr %gep.30 - ret void -} diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll b/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll deleted file mode 100644 index 4e8ae40f039aa..0000000000000 --- a/llvm/test/Transforms/StraightLineStrengthReduce/path-compression.ll +++ /dev/null @@ -1,35 +0,0 @@ -; RUN: opt < %s -passes="slsr" -S | FileCheck %s - -target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" - -%struct.B = type { i16 } -%struct.A = type { %struct.B, %struct.B, %struct.B } - -define void @path_compression(i32 %a, ptr %base, i16 %r, i1 %cond) { -; CHECK-LABEL: @path_compression( -; CHECK: [[I:%.*]] = sext i32 %a to i64 -; CHECK: [[GEP1:%.*]] = getelementptr inbounds %struct.A, ptr %base, i64 [[I]] -; CHECK: br -; CHECK-LABEL: next -; compress the path to use GEP1 as the Basis instead of GEP2 -; CHECK: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 2 -; CHECK: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 4 - - - %1 = sext i32 %a to i64 - %2 = add i64 %1, 1 - %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1 - br i1 %cond, label %next, label %ret - -next: - %getElem2 = getelementptr inbounds %struct.A, ptr %base, 
i64 %1, i32 1 - %offset = sub i64 %2, 1 - %getElem3 = getelementptr inbounds %struct.A, ptr %base, i64 %offset, i32 2 - store i16 %r, ptr %getElem1, align 2 - store i16 %r, ptr %getElem2, align 2 - store i16 %r, ptr %getElem3, align 2 - br label %ret - -ret: - ret void -} diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll b/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll deleted file mode 100644 index 7fa0e4f9474f1..0000000000000 --- a/llvm/test/Transforms/StraightLineStrengthReduce/pick-candidate.ll +++ /dev/null @@ -1,32 +0,0 @@ -; RUN: opt < %s -passes="slsr" -S | FileCheck %s - -target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" - -%struct.B = type { i16 } -%struct.A = type { %struct.B, %struct.B } - -define i32 @pick(i32 %0, ptr %addr) { -; `d` can be optimized by 2 approaches -; 1. a = 1 + 1 * %0 -; d = 1 + 8 * %0 -; = a + 7 * %0 -; 2. c = (8 * %0) + 3 -; d = (8 * %0) + 1 -; = c - 2 -; Pick candidate (2) as it can save 1 instruction from (7 * %0) -; -; CHECK-LABEL: pick -; CHECK: [[A:%.*]] = add i32 %0, 1 -; CHECK: [[B:%.*]] = shl i32 %0, 3 -; CHECK: [[C:%.*]] = add i32 [[B]], 3 -; CHECK: store i32 [[C]], ptr %addr -; CHECK: [[D:%.*]] = add i32 [[C]], -2 -; CHECK: ret i32 %d - - %a = add i32 %0, 1 - %b = shl i32 %0, 3 - %c = add i32 %b, 3 - store i32 %c, ptr %addr - %d = add i32 %b, 1 - ret i32 %d -} diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll index 1abf2fa331821..d85331f77b12f 100644 --- a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-add.ll @@ -4,8 +4,6 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64" -; Index Delta - define void @shl(i32 %b, i32 %s) { ; CHECK-LABEL: @shl( ; CHECK-NEXT: [[T1:%.*]] = add i32 [[B:%.*]], [[S:%.*]] @@ -173,121 +171,3 @@ define void @slsr_strided_add_128bit(i128 %b, i128 %s) { declare void @foo(i32) declare void @voo(<2 x i32>) declare void @bar(i128) - -; Stride Delta - -define void @stride_const(i32 %a, ptr %base, i16 %r) { -; Reuse add1 to compute add2 -; CHECK-LABEL: @stride_const( -; CHECK-NEXT: [[I1:%.*]] = sext i32 [[A:%.*]] to i64 -; CHECK-NEXT: [[I2:%.*]] = mul i64 [[I1]], 2 -; CHECK-NEXT: [[BI:%.*]] = ptrtoint ptr [[BASE:%.*]] to i64 -; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[BI]], [[I2]] -; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[ADD1]], 8 -; CHECK-NEXT: [[ADDR1:%.*]] = inttoptr i64 [[ADD1]] to ptr -; CHECK-NEXT: [[ADDR2:%.*]] = inttoptr i64 [[ADD2]] to ptr -; CHECK-NEXT: store i16 [[R:%.*]], ptr [[ADDR1]], align 2 -; CHECK-NEXT: store i16 [[R]], ptr [[ADDR2]], align 2 -; CHECK-NEXT: ret void -; - %1 = sext i32 %a to i64 - %2 = mul i64 %1, 2 - %3 = add i64 %1, 4 - %4 = mul i64 %3, 2 - %baseInt = ptrtoint ptr %base to i64 - %add1 = add i64 %baseInt, %2 - %add2 = add i64 %baseInt, %4 - %addr1 = inttoptr i64 %add1 to ptr - %addr2 = inttoptr i64 %add2 to ptr - store i16 %r, ptr %addr1, align 2 - store i16 %r, ptr %addr2, align 2 - ret void -} - -define void @stride_var(i32 %a, ptr %base, i16 %r, i64 %n) { -; Reuse add1 to compute add2 to save a add.s64 -; CHECK-LABEL: @stride_var( -; CHECK-NEXT: [[I1:%.*]] = sext i32 [[A:%.*]] to i64 -; CHECK-NEXT: [[I2:%.*]] = mul i64 [[I1]], 2 -; CHECK-NEXT: [[BI:%.*]] = ptrtoint ptr [[BASE:%.*]] to i64 -; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[BI]], [[I2]] -; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[N:%.*]], 1 -; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[ADD1]], [[TMP3]] -; CHECK-NEXT: [[ADDR1:%.*]] = 
inttoptr i64 [[ADD1]] to ptr -; CHECK-NEXT: [[ADDR2:%.*]] = inttoptr i64 [[ADD2]] to ptr -; CHECK-NEXT: store i16 [[R:%.*]], ptr [[ADDR1]], align 2 -; CHECK-NEXT: store i16 [[R]], ptr [[ADDR2]], align 2 -; CHECK-NEXT: ret void -; - - %1 = sext i32 %a to i64 - %2 = mul i64 %1, 2 - %3 = add i64 %1, %n - %4 = mul i64 %3, 2 - %baseInt = ptrtoint ptr %base to i64 - %add1 = add i64 %baseInt, %2 - %add2 = add i64 %baseInt, %4 - %addr1 = inttoptr i64 %add1 to ptr - %addr2 = inttoptr i64 %add2 to ptr - store i16 %r, ptr %addr1, align 2 - store i16 %r, ptr %addr2, align 2 - ret void -} - -; Base Delta - -define void @base_const(i32 %a, ptr %base, i16 %r) { -; Reuse add1 to compute add2 -; CHECK-LABEL: @base_const( -; CHECK-NEXT: [[I1:%.*]] = sext i32 [[A:%.*]] to i64 -; CHECK-NEXT: [[I2:%.*]] = mul i64 [[I1]], 2 -; CHECK-NEXT: [[BI:%.*]] = ptrtoint ptr [[BASE:%.*]] to i64 -; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[BI]], [[I2]] -; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[ADD1]], 5 -; CHECK-NEXT: [[ADDR1:%.*]] = inttoptr i64 [[ADD1]] to ptr -; CHECK-NEXT: [[ADDR2:%.*]] = inttoptr i64 [[ADD2]] to ptr -; CHECK-NEXT: store i16 [[R:%.*]], ptr [[ADDR1]], align 2 -; CHECK-NEXT: store i16 [[R]], ptr [[ADDR2]], align 2 -; CHECK-NEXT: ret void -; - - %1 = sext i32 %a to i64 - %2 = mul i64 %1, 2 - %baseInt = ptrtoint ptr %base to i64 - %add1 = add i64 %baseInt, %2 - %add2.0 = add i64 %baseInt, 5 - %add2 = add i64 %add2.0, %2 - %addr1 = inttoptr i64 %add1 to ptr - %addr2 = inttoptr i64 %add2 to ptr - store i16 %r, ptr %addr1, align 2 - store i16 %r, ptr %addr2, align 2 - ret void -} - -define void @base_var(i32 %a, ptr %base, i16 %r, i64 %n) { -; Reuse add1 to compute add2 -; CHECK-LABEL: @base_var( -; CHECK-NEXT: [[I1:%.*]] = sext i32 [[A:%.*]] to i64 -; CHECK-NEXT: [[I2:%.*]] = mul i64 [[I1]], 2 -; CHECK-NEXT: [[BI:%.*]] = ptrtoint ptr [[BASE:%.*]] to i64 -; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[BI]], [[I2]] -; CHECK-NEXT: [[ADD2:%.*]] = add i64 [[ADD1]], [[N:%.*]] -; CHECK-NEXT: [[ADDR1:%.*]] = inttoptr i64 [[ADD1]] to ptr -; CHECK-NEXT: [[ADDR2:%.*]] = inttoptr i64 [[ADD2]] to ptr -; CHECK-NEXT: store i16 [[R:%.*]], ptr [[ADDR1]], align 2 -; CHECK-NEXT: store i16 [[R]], ptr [[ADDR2]], align 2 -; CHECK-NEXT: ret void -; - - %1 = sext i32 %a to i64 - %2 = mul i64 %1, 2 - %baseInt = ptrtoint ptr %base to i64 - %add1 = add i64 %baseInt, %2 - %add2.0 = add i64 %baseInt, %n - %add2 = add i64 %add2.0, %2 - %addr1 = inttoptr i64 %add1 to ptr - %addr2 = inttoptr i64 %add2 to ptr - store i16 %r, ptr %addr1, align 2 - store i16 %r, ptr %addr2, align 2 - ret void -} diff --git a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll index ee99c6bbeb50a..7cd45329c24fe 100644 --- a/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll +++ b/llvm/test/Transforms/StraightLineStrengthReduce/slsr-gep.ll @@ -3,43 +3,6 @@ target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64-p:64:64:64-p1:32:32:32-p2:128:128:128:32" -; Index Delta -; Most of the original test cases in this file were optimized by Index-delta. -; After adding Base-delta and Stride-delta, most of the GEP test cases -; are optimized by Stride-delta now. The only case that GEP needs index-delta -; SLSR is to reuse address computation from a GEP with different pointee type. -; Once LLVM completely moves from typed GEP to PtrAdd, we can remove -; index-delta for GEP/PtrAdd. 
- -define void @index_delta(ptr %input, i32 %c, i32 %b, i32 %n, float %r) { -; CHECK-LABEL: define void @index_delta( -; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) { -; CHECK-NEXT: [[ADD0:%.*]] = add nsw i32 [[B]], 1 -; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[ADD0]], [[N]] -; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[MUL_1]], [[C]] -; CHECK-NEXT: [[OFFSET:%.*]] = sext i32 [[ADD1]] to i64 -; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr i8, ptr [[INPUT]], i64 [[OFFSET]] -; CHECK-NEXT: store float [[R]], ptr [[GETELEM]], align 4 -; CHECK-NEXT: [[TMP:%.*]] = mul i64 [[OFFSET]], 3 -; CHECK-NEXT: [[GETELEM_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM]], i64 [[TMP]] -; CHECK-NEXT: store float [[R]], ptr [[GETELEM_1]], align 4 -; CHECK-NEXT: ret void -; - - %add0 = add nsw i32 %b, 1 - %mul.1 = mul nsw i32 %add0, %n - %add.1 = add i32 %mul.1, %c - %offset = sext i32 %add.1 to i64 - %getElem = getelementptr i8, ptr %input, i64 %offset - store float %r, ptr %getElem, align 4 - %getElem.1 = getelementptr inbounds float, ptr %input, i64 %offset - store float %r, ptr %getElem.1, align 4 - - ret void -} - -; Stride Delta - ; foo(input[0]); ; foo(input[s]); ; foo(input[s * 2]); @@ -300,115 +263,3 @@ declare void @foo(ptr) declare void @bar(ptr) declare void @baz(ptr addrspace(1)) declare void @baz2(ptr addrspace(2)) - -define void @stride_const(ptr %input, i32 %c, i32 %b, i32 %n, float %r) { -; CHECK-LABEL: define void @stride_const( -; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) { -; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[B]], [[N]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 [[MUL]], [[C]] -; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD]], [[N]] -; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[N]] -; CHECK-NEXT: [[OFFSET:%.*]] = sext i32 [[ADD_2]] to i64 -; CHECK-NEXT: [[GETELEM_1:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[OFFSET]] -; CHECK-NEXT: store float [[R]], ptr [[GETELEM_1]], align 4 -; CHECK-NEXT: [[GETELEM_2:%.*]] = getelementptr i8, ptr [[GETELEM_1]], i64 16 -; CHECK-NEXT: store float [[R]], ptr [[GETELEM_2]], align 4 -; CHECK-NEXT: ret void -; - - %mul = mul nsw i32 %b, %n - %add = add i32 %mul, %c - %add.1 = add i32 %add, %n - %add.2 = add i32 %add.1, %n - - %offset = sext i32 %add.2 to i64 - %1 = getelementptr float, ptr %input, i64 %offset - store float %r, ptr %1, align 4 - - %offset3 = add i64 %offset, 4 - %2 = getelementptr float, ptr %input, i64 %offset3 - store float %r, ptr %2, align 4 - ret void -} - - -define void @stride_var(ptr %input, i32 %c, i32 %b, i32 %n, float %r) { -; CHECK-LABEL: define void @stride_var( -; CHECK-SAME: ptr [[INPUT:%.*]], i32 [[C:%.*]], i32 [[B:%.*]], i32 [[N:%.*]], float [[R:%.*]]) { -; CHECK-NEXT: [[ADD0:%.*]] = add nsw i32 [[B]], 1 -; CHECK-NEXT: [[MUL_1:%.*]] = mul nsw i32 [[ADD0]], [[N]] -; CHECK-NEXT: [[ADD1:%.*]] = add i32 [[MUL_1]], [[C]] -; CHECK-NEXT: [[I:%.*]] = sext i32 [[ADD1]] to i64 -; CHECK-NEXT: [[GETELEM:%.*]] = getelementptr float, ptr [[INPUT]], i64 [[I]] -; CHECK-NEXT: store float [[R]], ptr [[GETELEM]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[N]] to i64 -; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[TMP1]], 2 -; CHECK-NEXT: [[GETELEM_1:%.*]] = getelementptr inbounds i8, ptr [[GETELEM]], i64 [[TMP2]] -; CHECK-NEXT: store float [[R]], ptr [[GETELEM_1]], align 4 -; CHECK-NEXT: ret void -; -; Reuse getElem to compute getElem.1 and getElem.2 with variable offset n extracted from Stride - - %add0 = add nsw i32 %b, 1 - %mul.1 = mul nsw i32 
%add0, %n - %add.1 = add i32 %mul.1, %c - %offset = sext i32 %add.1 to i64 - %getElem = getelementptr float, ptr %input, i64 %offset - store float %r, ptr %getElem, align 4 - - %mul = mul nsw i32 %b, %n - %add = add i32 %mul, %c - %add.11 = add i32 %add, %n - %add.2 = add i32 %add.11, %n - %offset1 = sext i32 %add.2 to i64 - %getElem.1 = getelementptr inbounds float, ptr %input, i64 %offset1 - store float %r, ptr %getElem.1, align 4 - - ret void -} - -; Base Delta - -%struct.B = type { i16 } -%struct.A = type { %struct.B, %struct.B } - -define void @base_const(i32 %a, ptr %base, i16 %r) { -; Reuse getElem1 to compute getElem2 -; CHECK-LABEL: define void @base_const( -; CHECK-SAME: i32 [[A:%.*]], ptr [[BASE:%.*]], i16 [[R:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[A]] to i64 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], ptr [[BASE]], i64 [[TMP1]] -; CHECK-NEXT: store i16 [[R]], ptr [[GEP1]], align 2 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 2 -; CHECK-NEXT: store i16 [[R]], ptr [[GEP2]], align 2 -; CHECK-NEXT: ret void -; - - %1 = sext i32 %a to i64 - %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1 - store i16 %r, ptr %getElem1, align 2 - %getElem2 = getelementptr inbounds %struct.A, ptr %base, i64 %1, i32 1 - store i16 %r, ptr %getElem2, align 2 - ret void -} - -define void @base_var(i32 %a, ptr %base, i16 %r, i64 %n) { -; Reuse getElem1 to compute getElem2 -; CHECK-LABEL: define void @base_var( -; CHECK-SAME: i32 [[A:%.*]], ptr [[BASE:%.*]], i16 [[R:%.*]], i64 [[N:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[A]] to i64 -; CHECK-NEXT: [[GETELEM1:%.*]] = getelementptr inbounds [[STRUCT_A:%.*]], ptr [[BASE]], i64 [[TMP1]] -; CHECK-NEXT: store i16 [[R]], ptr [[GETELEM1]], align 2 -; CHECK-NEXT: [[GETELEM2:%.*]] = getelementptr inbounds i8, ptr [[GETELEM1]], i64 [[N]] -; CHECK-NEXT: store i16 [[R]], ptr [[GETELEM2]], align 2 -; CHECK-NEXT: ret void -; - - %1 = sext i32 %a to i64 - %base1 = getelementptr inbounds i8, ptr %base, i64 %n - %getElem1 = getelementptr inbounds %struct.A, ptr %base, i64 %1 - store i16 %r, ptr %getElem1, align 2 - %getElem2 = getelementptr inbounds %struct.A, ptr %base1, i64 %1 - store i16 %r, ptr %getElem2, align 2 - ret void -} From 6968e4908a3409abf161ba135667953124347a7a Mon Sep 17 00:00:00 2001 From: Alan Li Date: Tue, 25 Nov 2025 19:26:42 +0000 Subject: [PATCH 2/3] Fix lit tests. 
---
 .../AMDGPU/promote-constOffset-to-imm.ll | 1 +
 .../AMDGPU/splitkit-getsubrangeformask.ll | 50 +++++++++----------
 2 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 3259ddf559d77..3329c9a761900 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -720,6 +720,7 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
 ; GFX90A-NEXT: v_add_co_u32_e32 v2, vcc, 0x5000, v1
 ; GFX90A-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x7f
+; GFX90A-NEXT: v_pk_mov_b32 v[4:5], 0, 0
 ; GFX90A-NEXT: s_movk_i32 s0, 0xd000
 ; GFX90A-NEXT: s_movk_i32 s1, 0xe000
 ; GFX90A-NEXT: s_movk_i32 s2, 0xf000
diff --git a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
index d6d639c5e496b..67dae136afb72 100644
--- a/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
+++ b/llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll
@@ -146,11 +146,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
 ; CHECK-NEXT: [[S_ASHR_I32_5:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_4]], 31, implicit-def dead $scc
 ; CHECK-NEXT: undef [[S_ADD_U32_18:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_4]], implicit-def $scc
 ; CHECK-NEXT: [[S_ADD_U32_18:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_5]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.273, align 8, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM15:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.157, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM14]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM11]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_18]], 168, 0 :: (invariant load (s32) from %ir.275, align 8, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM14:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_8]], 576, 0 :: (invariant load (s128) from %ir.159, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN11:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM13]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN12:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM9]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN13:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM10]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub3:sgpr_128 = S_MOV_B32 553734060
 ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]].sub2:sgpr_128 = S_MOV_B32 -1
 ; CHECK-NEXT: [[COPY15:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
@@ -169,11 +169,11 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
 ; CHECK-NEXT: [[S_ADD_I32_14:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM4]], -467, implicit-def dead $scc
 ; CHECK-NEXT: undef [[S_ADD_U32_19:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_5]], implicit-def $scc
 ; CHECK-NEXT: [[S_ADD_U32_19:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_6]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.282, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM18]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.205, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.211, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM1:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_19]], 168, 0 :: (invariant load (s64) from %ir.284, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET2:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM16]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_DWORD_OFFSET3:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[S_LOAD_DWORDX4_IMM17]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM18:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_12]], 0, 0 :: (invariant load (s128) from %ir.207, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM19:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_13]], 0, 0 :: (invariant load (s128) from %ir.213, addrspace 4)
 ; CHECK-NEXT: [[COPY16:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM20:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_14]], 0, 0 :: (invariant load (s128) from %ir.218, addrspace 4)
 ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM21:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_15]], 0, 0 :: (invariant load (s128) from %ir.223, addrspace 4)
@@ -190,26 +190,22 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
 ; CHECK-NEXT: [[S_ADD_I32_15:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM5]], -468, implicit-def dead $scc
 ; CHECK-NEXT: undef [[S_ADD_U32_20:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_6]], implicit-def $scc
 ; CHECK-NEXT: [[S_ADD_U32_20:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.293, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX2_IMM2:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[S_ADD_U32_20]], 168, 0 :: (invariant load (s64) from %ir.295, addrspace 4)
 ; CHECK-NEXT: [[COPY17:%[0-9]+]]:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM]]
 ; CHECK-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32 = S_AND_B32 [[S_LOAD_DWORDX2_IMM2]].sub1, 65535, implicit-def dead $scc
 ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub0:sgpr_128 = COPY [[S_LOAD_DWORDX2_IMM2]].sub0
 ; CHECK-NEXT: [[COPY17:%[0-9]+]].sub1:sgpr_128 = COPY [[S_AND_B32_1]]
 ; CHECK-NEXT: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[COPY17]], 0, 0 :: (dereferenceable invariant load (s32))
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.256, addrspace 4)
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %469:sreg_64, 0, 0 :: (invariant load (s32) from `ptr addrspace(4) poison`, addrspace 4)
- ; CHECK-NEXT: KILL [[S_ADD_U32_16]].sub0, [[S_ADD_U32_16]].sub1
- ; CHECK-NEXT: KILL undef %469:sreg_64
- ; CHECK-NEXT: KILL [[COPY17]].sub0_sub1_sub2, [[COPY17]].sub3
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM22:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_16]], 160, 0 :: (invariant load (s128) from %ir.258, addrspace 4)
 ; CHECK-NEXT: [[S_LSHL_B32_7:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY14]], 3, implicit-def dead $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.265, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_17]], 160, 0 :: (invariant load (s128) from %ir.267, addrspace 4)
 ; CHECK-NEXT: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_7]], 31, implicit-def dead $scc
 ; CHECK-NEXT: [[S_ADD_I32_16:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM6]], -469, implicit-def dead $scc
 ; CHECK-NEXT: undef [[S_ADD_U32_21:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY]].sub0, [[S_LSHL_B32_7]], implicit-def $scc
 ; CHECK-NEXT: [[S_ADD_U32_21:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %57:sreg_32, [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.305, align 8, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[S_ADD_U32_21]], 168, 0 :: (invariant load (s32) from %ir.307, align 8, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM22]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
 ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM23]]
 ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM22]]
 ; CHECK-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32 = S_AND_B32 %73.sub0, 65535, implicit-def dead $scc
@@ -225,17 +221,17 @@ define amdgpu_gs void @_amdgpu_gs_main(i32 inreg %primShaderTableAddrLow, <31 x
 ; CHECK-NEXT: [[S_ADD_I32_22:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM7]], -473, implicit-def dead $scc
 ; CHECK-NEXT: undef [[S_ADD_U32_22:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_]], implicit-def $scc
 ; CHECK-NEXT: [[S_ADD_U32_22:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.323, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_22]], 96, 0 :: (invariant load (s128) from %ir.325, addrspace 4)
 ; CHECK-NEXT: undef [[S_ADD_U32_23:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_1]], implicit-def $scc
 ; CHECK-NEXT: [[S_ADD_U32_23:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_1]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.329, addrspace 4)
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM25:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_23]], 96, 0 :: (invariant load (s128) from %ir.331, addrspace 4)
 ; CHECK-NEXT: undef [[S_ADD_U32_24:%[0-9]+]].sub0:sreg_64 = S_ADD_U32 [[COPY1]], [[S_LSHL_B32_2]], implicit-def $scc
 ; CHECK-NEXT: [[S_ADD_U32_24:%[0-9]+]].sub1:sreg_64 = S_ADDC_U32 undef %33:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc
- ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.335, addrspace 4)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
- ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]
+ ; CHECK-NEXT: [[S_LOAD_DWORDX4_IMM26:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM [[S_ADD_U32_24]], 96, 0 :: (invariant load (s128) from %ir.337, addrspace 4)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 8)
+ ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM24]]
 ; CHECK-NEXT: KILL [[V_MOV_B32_e32_]]
 ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM25]]
 ; CHECK-NEXT: KILL [[S_LOAD_DWORDX4_IMM26]]

From 11d93a80b3f496935f828f6e33b86da3977ff292 Mon Sep 17 00:00:00 2001
From: Alan Li
Date: Tue, 25 Nov 2025 19:52:09 +0000
Subject: [PATCH 3/3] regen checks.

---
 llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
index 30f52773bbce5..a0aee6c80703f 100644
--- a/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vscnt.ll
@@ -26,17 +26,17 @@ define amdgpu_kernel void @barrier_vmcnt_global(ptr addrspace(1) %arg) {
 ; GFX9-LABEL: barrier_vmcnt_global:
 ; GFX9: s_load_dwordx2 s[0:1], s[4:5], 0x24
 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 2, v0
-; GFX9-NEXT: v_add_u32_e32 v2, 1, v0
 ; GFX9-NEXT: s_waitcnt lgkmcnt(0)
-; GFX9-NEXT: global_load_dword v3, v1, s[0:1]
-; GFX9-NEXT: v_mov_b32_e32 v1, 0
-; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[1:2]
-; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: global_load_dword v2, v1, s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 1, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 30, v[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0
-; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v2, v1, vcc
+; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v3, v1, vcc
 ; GFX9-NEXT: s_waitcnt vmcnt(0)
 ; GFX9-NEXT: s_barrier
-; GFX9-NEXT: global_store_dword v[0:1], v3, off
+; GFX9-NEXT: global_store_dword v[0:1], v2, off
 ; GFX9-NEXT: s_endpgm
 bb:
 %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()