-
Notifications
You must be signed in to change notification settings - Fork 10.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[AMDGPU] Add IR LiveReg type-based optimization #66838
base: main
Are you sure you want to change the base?
Conversation
@llvm/pr-subscribers-llvm-globalisel @llvm/pr-subscribers-backend-amdgpu Changes: NOTE: This commit is part of a stack which spans across phabricator. The PR is meant only for the top of the stack ([AMDGPU] Add IR LiveReg type-based optimization). As suggested in #66134, this adds the IR level logic to coerce the type of illegal vectors which have live ranges that span across basic blocks. The issue is that local ISel will emit CopyToReg / CopyFromReg pairs for live ranges spanning basic blocks. For illegal vector types, the DAGBuilder will legalize by scalarizing the vector, then widening each scalar, and passing each scalar via a separate physical register. See https://godbolt.org/z/Y7MhcjGE8 for a demo of the issue. This feature identifies cases like these, and inserts bitcasts between the def of the illegal vector and the uses in different blocks. This results in avoiding the scalarization process and an ability to pack the bits into fewer registers -- for example, we now use 2 VGPRs for a v8i8 instead of 8. Patch is 637.69 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/66838.diff — 18 Files Affected:
diff --git a/llvm/include/llvm/CodeGen/ByteProvider.h b/llvm/include/llvm/CodeGen/ByteProvider.h
index 3187b4e68c56f3a..99ae8607c0b2071 100644
--- a/llvm/include/llvm/CodeGen/ByteProvider.h
+++ b/llvm/include/llvm/CodeGen/ByteProvider.h
@@ -32,6 +32,11 @@ template <typename ISelOp> class ByteProvider {
ByteProvider(std::optional<ISelOp> Src, int64_t DestOffset, int64_t SrcOffset)
: Src(Src), DestOffset(DestOffset), SrcOffset(SrcOffset) {}
+ ByteProvider(std::optional<ISelOp> Src, int64_t DestOffset, int64_t SrcOffset,
+ std::optional<bool> IsSigned)
+ : Src(Src), DestOffset(DestOffset), SrcOffset(SrcOffset),
+ IsSigned(IsSigned) {}
+
// TODO -- use constraint in c++20
// Does this type correspond with an operation in selection DAG
template <typename T> class is_op {
@@ -61,6 +66,9 @@ template <typename ISelOp> class ByteProvider {
// DestOffset
int64_t SrcOffset = 0;
+ // Whether or not Src be treated as signed
+ std::optional<bool> IsSigned;
+
ByteProvider() = default;
static ByteProvider getSrc(std::optional<ISelOp> Val, int64_t ByteOffset,
@@ -70,6 +78,14 @@ template <typename ISelOp> class ByteProvider {
return ByteProvider(Val, ByteOffset, VectorOffset);
}
+ static ByteProvider getSrc(std::optional<ISelOp> Val, int64_t ByteOffset,
+ int64_t VectorOffset,
+ std::optional<bool> IsSigned) {
+ static_assert(is_op<ISelOp>().value,
+ "ByteProviders must contain an operation in selection DAG.");
+ return ByteProvider(Val, ByteOffset, VectorOffset, IsSigned);
+ }
+
static ByteProvider getConstantZero() {
return ByteProvider<ISelOp>(std::nullopt, 0, 0);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 4cce34bdeabcf44..b50379e98d0f6b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -106,6 +106,7 @@ class AMDGPUCodeGenPrepareImpl
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
bool HasUnsafeFPMath = false;
+ bool UsesGlobalISel = false;
bool HasFP32DenormalFlush = false;
bool FlowChanged = false;
mutable Function *SqrtF32 = nullptr;
@@ -341,6 +342,85 @@ class AMDGPUCodeGenPrepare : public FunctionPass {
StringRef getPassName() const override { return "AMDGPU IR optimizations"; }
};
+class LiveRegConversion {
+private:
+ // The instruction which defined the original virtual register used across
+ // blocks
+ Instruction *LiveRegDef;
+ // The original type
+ Type *OriginalType;
+ // The desired type
+ Type *NewType;
+ // The instruction sequence that converts the virtual register, to be used
+ // instead of the original
+ std::optional<Instruction *> Converted;
+ // The builder used to build the conversion instruction
+ IRBuilder<> ConvertBuilder;
+
+public:
+ // The instruction which defined the original virtual register used across
+ // blocks
+ Instruction *getLiveRegDef() { return LiveRegDef; }
+ // The original type
+ Type *getOriginalType() { return OriginalType; }
+ // The desired type
+ Type *getNewType() { return NewType; }
+ void setNewType(Type *NewType) { this->NewType = NewType; }
+ // The instruction that conerts the virtual register, to be used instead of
+ // the original
+ std::optional<Instruction *> &getConverted() { return Converted; }
+ void setConverted(Instruction *Converted) { this->Converted = Converted; }
+ // The builder used to build the conversion instruction
+ IRBuilder<> &getConverBuilder() { return ConvertBuilder; }
+ // Do we have a instruction sequence which convert the original virtual
+ // register
+ bool hasConverted() { return Converted.has_value(); }
+
+ LiveRegConversion(Instruction *LiveRegDef, BasicBlock *InsertBlock,
+ BasicBlock::iterator InsertPt)
+ : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+ ConvertBuilder(InsertBlock, InsertPt) {}
+ LiveRegConversion(Instruction *LiveRegDef, Type *NewType,
+ BasicBlock *InsertBlock, BasicBlock::iterator InsertPt)
+ : LiveRegDef(LiveRegDef), OriginalType(LiveRegDef->getType()),
+ NewType(NewType), ConvertBuilder(InsertBlock, InsertPt) {}
+};
+
+class LiveRegOptimizer {
+private:
+ Module *Mod = nullptr;
+ // The scalar type to convert to
+ Type *ConvertToScalar;
+ // Holds the collection of PHIs with their pending new operands
+ SmallVector<std::pair<Instruction *,
+ SmallVector<std::pair<Instruction *, BasicBlock *>, 4>>,
+ 4>
+ PHIUpdater;
+
+public:
+ // Should the def of the instruction be converted if it is live across blocks
+ bool shouldReplaceUses(const Instruction &I);
+ // Convert the virtual register to the compatible vector of legal type
+ void convertToOptType(LiveRegConversion &LR);
+ // Convert the virtual register back to the original type, stripping away
+ // the MSBs in cases where there was an imperfect fit (e.g. v2i32 -> v7i8)
+ void convertFromOptType(LiveRegConversion &LR);
+ // Get a vector of desired scalar type that is compatible with the original
+ // vector. In cases where there is no bitsize equivalent using a legal vector
+ // type, we pad the MSBs (e.g. v7i8 -> v2i32)
+ Type *getCompatibleType(Instruction *InstToConvert);
+ // Find and replace uses of the virtual register in different block with a
+ // newly produced virtual register of legal type
+ bool replaceUses(Instruction &I);
+ // Replace the collected PHIs with newly produced incoming values. Replacement
+ // is only done if we have a replacement for each original incoming value.
+ bool replacePHIs();
+
+ LiveRegOptimizer(Module *Mod) : Mod(Mod) {
+ ConvertToScalar = Type::getInt32Ty(Mod->getContext());
+ }
+};
+
} // end anonymous namespace
bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
@@ -358,6 +438,7 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
Next = std::next(I);
MadeChange |= visit(*I);
+ I->getType();
if (Next != E) { // Control flow changed
BasicBlock *NextInstBB = Next->getParent();
@@ -369,9 +450,269 @@ bool AMDGPUCodeGenPrepareImpl::run(Function &F) {
}
}
}
+
+ // GlobalISel should directly use the values, and do not need to emit
+ // CopyTo/CopyFrom Regs across blocks
+ if (UsesGlobalISel)
+ return MadeChange;
+
+ // "Optimize" the virtual regs that cross basic block boundaries. In such
+ // cases, vectors of illegal types will be scalarized and widened, with each
+ // scalar living in its own physical register. The optimization converts the
+ // vectors to equivalent vectors of legal type (which are convereted back
+ // before uses in subsequenmt blocks), to pack the bits into fewer physical
+ // registers (used in CopyToReg/CopyFromReg pairs).
+ LiveRegOptimizer LRO(Mod);
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ if (!LRO.shouldReplaceUses(I))
+ continue;
+ MadeChange |= LRO.replaceUses(I);
+ }
+ }
+
+ MadeChange |= LRO.replacePHIs();
+ return MadeChange;
+}
+
+bool LiveRegOptimizer::replaceUses(Instruction &I) {
+ bool MadeChange = false;
+
+ struct ConvertUseInfo {
+ Instruction *Converted;
+ SmallVector<Instruction *, 4> Users;
+ };
+ DenseMap<BasicBlock *, ConvertUseInfo> UseConvertTracker;
+
+ LiveRegConversion FromLRC(
+ &I, I.getParent(),
+ static_cast<BasicBlock::iterator>(std::next(I.getIterator())));
+ FromLRC.setNewType(getCompatibleType(FromLRC.getLiveRegDef()));
+ for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) {
+
+ if (auto UserInst = dyn_cast<Instruction>(*IUser)) {
+ if (UserInst->getParent() != I.getParent()) {
+ LLVM_DEBUG(dbgs() << *UserInst << "\n\tUses "
+ << *FromLRC.getOriginalType()
+ << " from previous block. Needs conversion\n");
+ convertToOptType(FromLRC);
+ if (!FromLRC.hasConverted())
+ continue;
+ // If it is a PHI node, just create and collect the new operand. We can
+ // only replace the PHI node once we have converted all the operands
+ if (auto PhiInst = dyn_cast<PHINode>(UserInst)) {
+ for (unsigned Idx = 0; Idx < PhiInst->getNumIncomingValues(); Idx++) {
+ auto IncVal = PhiInst->getIncomingValue(Idx);
+ if (&I == dyn_cast<Instruction>(IncVal)) {
+ auto IncBlock = PhiInst->getIncomingBlock(Idx);
+ auto PHIOps = find_if(
+ PHIUpdater,
+ [&UserInst](
+ std::pair<Instruction *,
+ SmallVector<
+ std::pair<Instruction *, BasicBlock *>, 4>>
+ &Entry) { return Entry.first == UserInst; });
+
+ if (PHIOps == PHIUpdater.end())
+ PHIUpdater.push_back(
+ {UserInst, {{*FromLRC.getConverted(), IncBlock}}});
+ else
+ PHIOps->second.push_back({*FromLRC.getConverted(), IncBlock});
+
+ break;
+ }
+ }
+ continue;
+ }
+
+ // Do not create multiple conversion sequences if there are multiple
+ // uses in the same block
+ if (UseConvertTracker.contains(UserInst->getParent())) {
+ UseConvertTracker[UserInst->getParent()].Users.push_back(UserInst);
+ LLVM_DEBUG(dbgs() << "\tUser already has access to converted def\n");
+ continue;
+ }
+
+ LiveRegConversion ToLRC(*FromLRC.getConverted(), I.getType(),
+ UserInst->getParent(),
+ static_cast<BasicBlock::iterator>(
+ UserInst->getParent()->getFirstNonPHIIt()));
+ convertFromOptType(ToLRC);
+ assert(ToLRC.hasConverted());
+ UseConvertTracker[UserInst->getParent()] = {*ToLRC.getConverted(),
+ {UserInst}};
+ }
+ }
+ }
+
+ // Replace uses of with in a separate loop that is not dependent upon the
+ // state of the uses
+ for (auto &Entry : UseConvertTracker) {
+ for (auto &UserInst : Entry.second.Users) {
+ LLVM_DEBUG(dbgs() << *UserInst
+ << "\n\tNow uses: " << *Entry.second.Converted << "\n");
+ UserInst->replaceUsesOfWith(&I, Entry.second.Converted);
+ MadeChange = true;
+ }
+ }
+ return MadeChange;
+}
+
+bool LiveRegOptimizer::replacePHIs() {
+ bool MadeChange = false;
+ for (auto Ele : PHIUpdater) {
+ auto ThePHINode = dyn_cast<PHINode>(Ele.first);
+ assert(ThePHINode);
+ auto NewPHINodeOps = Ele.second;
+ LLVM_DEBUG(dbgs() << "Attempting to replace: " << *ThePHINode << "\n");
+ // If we have conveted all the required operands, then do the replacement
+ if (ThePHINode->getNumIncomingValues() == NewPHINodeOps.size()) {
+ IRBuilder<> Builder(Ele.first);
+ auto NPHI = Builder.CreatePHI(NewPHINodeOps[0].first->getType(),
+ NewPHINodeOps.size());
+ for (auto IncVals : NewPHINodeOps) {
+ NPHI->addIncoming(IncVals.first, IncVals.second);
+ LLVM_DEBUG(dbgs() << " Using: " << *IncVals.first
+ << " For: " << IncVals.second->getName() << "\n");
+ }
+ LLVM_DEBUG(dbgs() << "Sucessfully replaced with " << *NPHI << "\n");
+ LiveRegConversion ToLRC(NPHI, ThePHINode->getType(),
+ ThePHINode->getParent(),
+ static_cast<BasicBlock::iterator>(
+ ThePHINode->getParent()->getFirstNonPHIIt()));
+ convertFromOptType(ToLRC);
+ assert(ToLRC.hasConverted());
+ Ele.first->replaceAllUsesWith(*ToLRC.getConverted());
+ // The old PHI is no longer used
+ ThePHINode->eraseFromParent();
+ MadeChange = true;
+ }
+ }
return MadeChange;
}
+Type *LiveRegOptimizer::getCompatibleType(Instruction *InstToConvert) {
+ auto OriginalType = InstToConvert->getType();
+ assert(OriginalType->getScalarSizeInBits() <=
+ ConvertToScalar->getScalarSizeInBits());
+ auto VTy = dyn_cast<VectorType>(OriginalType);
+ if (!VTy)
+ return ConvertToScalar;
+
+ auto OriginalSize =
+ VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+ auto ConvertScalarSize = ConvertToScalar->getScalarSizeInBits();
+ auto ConvertEltCount =
+ (OriginalSize + ConvertScalarSize - 1) / ConvertScalarSize;
+
+ return VectorType::get(Type::getIntNTy(Mod->getContext(), ConvertScalarSize),
+ llvm::ElementCount::getFixed(ConvertEltCount));
+}
+
+void LiveRegOptimizer::convertToOptType(LiveRegConversion &LR) {
+ if (LR.hasConverted()) {
+ LLVM_DEBUG(dbgs() << "\tAlready has converted def\n");
+ return;
+ }
+
+ auto VTy = dyn_cast<VectorType>(LR.getOriginalType());
+ assert(VTy);
+ auto NewVTy = dyn_cast<VectorType>(LR.getNewType());
+ assert(NewVTy);
+
+ auto V = static_cast<Value *>(LR.getLiveRegDef());
+ auto OriginalSize =
+ VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+ auto NewSize =
+ NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+ auto &Builder = LR.getConverBuilder();
+
+ // If there is a bitsize match, we can fit the old vector into a new vector of
+ // desired type
+ if (OriginalSize == NewSize) {
+ LR.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+ LLVM_DEBUG(dbgs() << "\tConverted def to "
+ << *(*LR.getConverted())->getType() << "\n");
+ return;
+ }
+
+ // If there is a bitsize mismatch, we must use a wider vector
+ assert(NewSize > OriginalSize);
+ auto ExpandedVecElementCount =
+ llvm::ElementCount::getFixed(NewSize / VTy->getScalarSizeInBits());
+
+ SmallVector<int, 8> ShuffleMask;
+ for (unsigned I = 0; I < VTy->getElementCount().getFixedValue(); I++)
+ ShuffleMask.push_back(I);
+
+ for (uint64_t I = VTy->getElementCount().getFixedValue();
+ I < ExpandedVecElementCount.getFixedValue(); I++)
+ ShuffleMask.push_back(VTy->getElementCount().getFixedValue());
+
+ auto ExpandedVec =
+ dyn_cast<Instruction>(Builder.CreateShuffleVector(V, ShuffleMask));
+ LR.setConverted(
+ dyn_cast<Instruction>(Builder.CreateBitCast(ExpandedVec, NewVTy)));
+ LLVM_DEBUG(dbgs() << "\tConverted def to " << *(*LR.getConverted())->getType()
+ << "\n");
+ return;
+}
+
+void LiveRegOptimizer::convertFromOptType(LiveRegConversion &LRC) {
+ auto VTy = dyn_cast<VectorType>(LRC.getOriginalType());
+ assert(VTy);
+ auto NewVTy = dyn_cast<VectorType>(LRC.getNewType());
+ assert(NewVTy);
+
+ auto V = static_cast<Value *>(LRC.getLiveRegDef());
+ auto OriginalSize =
+ VTy->getScalarSizeInBits() * VTy->getElementCount().getFixedValue();
+ auto NewSize =
+ NewVTy->getScalarSizeInBits() * NewVTy->getElementCount().getFixedValue();
+
+ auto &Builder = LRC.getConverBuilder();
+
+ // If there is a bitsize match, we simply convert back to the original type
+ if (OriginalSize == NewSize) {
+ LRC.setConverted(dyn_cast<Instruction>(Builder.CreateBitCast(V, NewVTy)));
+ LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted()
+ << "\n");
+ return;
+ }
+
+ // If there is a bitsize mismatch, we have used a wider vector and must strip
+ // the MSBs to convert back to the original type
+ assert(OriginalSize > NewSize);
+ auto ExpandedVecElementCount = llvm::ElementCount::getFixed(
+ OriginalSize / NewVTy->getScalarSizeInBits());
+ auto ExpandedVT = VectorType::get(
+ Type::getIntNTy(Mod->getContext(), NewVTy->getScalarSizeInBits()),
+ ExpandedVecElementCount);
+ auto Converted = dyn_cast<Instruction>(
+ Builder.CreateBitCast(LRC.getLiveRegDef(), ExpandedVT));
+
+ auto NarrowElementCount = NewVTy->getElementCount().getFixedValue();
+ SmallVector<int, 8> ShuffleMask;
+ for (uint64_t I = 0; I < NarrowElementCount; I++)
+ ShuffleMask.push_back(I);
+
+ auto NarrowVec = dyn_cast<Instruction>(
+ Builder.CreateShuffleVector(Converted, ShuffleMask));
+ LRC.setConverted(dyn_cast<Instruction>(NarrowVec));
+ LLVM_DEBUG(dbgs() << "\tProduced for user: " << **LRC.getConverted() << "\n");
+ return;
+}
+
+bool LiveRegOptimizer::shouldReplaceUses(const Instruction &I) {
+ // Vectors of illegal types are copied across blocks in an efficient manner.
+ // They are scalarized and widened to legal scalars. In such cases, we can do
+ // better by using legal vector types
+ auto IType = I.getType();
+ return IType->isVectorTy() && IType->getScalarSizeInBits() < 16 &&
+ !I.getType()->getScalarType()->isPointerTy();
+}
+
unsigned AMDGPUCodeGenPrepareImpl::getBaseElementBitWidth(const Type *T) const {
assert(needsPromotionToI32(T) && "T does not need promotion to i32");
@@ -2230,6 +2571,7 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
Impl.ST = &TM.getSubtarget<GCNSubtarget>(F);
Impl.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
Impl.UA = &getAnalysis<UniformityInfoWrapperPass>().getUniformityInfo();
+ Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
Impl.DT = DTWP ? &DTWP->getDomTree() : nullptr;
Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
@@ -2250,6 +2592,7 @@ PreservedAnalyses AMDGPUCodeGenPreparePass::run(Function &F,
Impl.UA = &FAM.getResult<UniformityInfoAnalysis>(F);
Impl.DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
Impl.HasUnsafeFPMath = hasUnsafeFPMath(F);
+ Impl.UsesGlobalISel = TM.Options.EnableGlobalISel;
SIModeRegisterDefaults Mode(F);
Impl.HasFP32DenormalFlush =
Mode.FP32Denormals == DenormalMode::getPreserveSign();
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 1c85ec3f9f5212f..403d61f1b836fab 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -10652,23 +10652,27 @@ SDValue SITargetLowering::performAndCombine(SDNode *N,
// performed.
static const std::optional<ByteProvider<SDValue>>
calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
+ std::optional<bool> IsSigned = std::nullopt,
unsigned Depth = 0) {
// We may need to recursively traverse a series of SRLs
if (Depth >= 6)
return std::nullopt;
- auto ValueSize = Op.getValueSizeInBits();
- if (ValueSize != 8 && ValueSize != 16 && ValueSize != 32)
+ if (Op.getValueSizeInBits() < 8)
return std::nullopt;
switch (Op->getOpcode()) {
case ISD::TRUNCATE: {
- return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, IsSigned,
+ Depth + 1);
}
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
case ISD::SIGN_EXTEND_INREG: {
+ IsSigned = IsSigned.value_or(false) ||
+ Op->getOpcode() == ISD::SIGN_EXTEND ||
+ Op->getOpcode() == ISD::SIGN_EXTEND_INREG;
SDValue NarrowOp = Op->getOperand(0);
auto NarrowVT = NarrowOp.getValueType();
if (Op->getOpcode() == ISD::SIGN_EXTEND_INREG) {
@@ -10681,7 +10685,8 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
if (SrcIndex >= NarrowByteWidth)
return std::nullopt;
- return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, IsSigned,
+ Depth + 1);
}
case ISD::SRA:
@@ -10697,12 +10702,38 @@ calculateSrcByte(const SDValue Op, uint64_t DestByte, uint64_t SrcIndex = 0,
SrcIndex += BitShift / 8;
- return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, Depth + 1);
+ return calculateSrcByte(Op->getOperand(0), DestByte, SrcIndex, IsSigned,
+ Depth + 1);
}
- default: {
+ case ISD::EXTRACT_VECTOR_ELT: {
+ auto IdxOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
+ if (!IdxOp)
+ return std::nullopt;
+ auto VecIdx = IdxOp->getZExtValue();
+ auto ScalarSize = Op.getScalarValueS...
[truncated]
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs a rebase to show the already pushed dependencies
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Needs rebase, most of this stuff was already merged?
4ee0c89
to
1c502bc
Compare
Just a rebase to reflect the current outstanding work -- I still need to address the other comments. |
1c502bc
to
71526ba
Compare
Change-Id: Ide8a46cdaf1d2d82cbd5296c998a5c8fd41fce80
71526ba
to
5efb955
Compare
Decoupling from "[AMDGPU]: Accept constant zero bytes in v_perm OrCombine" as that is taking longer than expected, and this has priority. As a result, in the exotic cases (e.g. v3i8), we may produce suboptimal codegen, but, for the normal case, codegen is much improved. |
…eCodeGenPrepare after CodeSinking + Integrate the loops Change-Id: Iac0baf0ab9e523bf303585b545f060293e6fb4f0
Change-Id: I1b461e3194a27e5e3c45500cae0ef5d4d6540d59
Change-Id: Ia56d86e1acf191d19f6fc43ae780de9bb5118ba9
Change-Id: I8eeacb7d4292a215bb0540e8e7dd12ab7547d058
Change-Id: I94504f26819c45de7496b39fee8031bcda0f29fb
Change-Id: I4383004240dc0365de6e67b12dc9ea5b609826d2
Change-Id: I07bf0cf4537bd3b148dc4ee3b785b989f0aac8b0
Change-Id: I83ae012da3118b0a40fb8a80be5029ce5bd2d78a
Change-Id: Idbfbbadfc1c3cee6cbd1a814b3446628dcce4394
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should include a test where a predecessor block has a repeated successor (i.e. it has multiple incoming switch cases)
Change-Id: I244784728ff1b4363ff066f8c5a6fa6d03c2a4d5
How does this compare to the usage of
|
Interesting -- I did not see that, thanks. That feature does have the same structure as the feature in this PR, but it seems to serve a different purpose: to help fold away bitcasts. It will insert bitcasts if it finds that the bitcasts defining the incoming values and the bitcast uses of the PHI are all of the same type. The target hook is invoked in special cases: when there are no bitcast defs/uses, or when the bitcasts are defined by a load (for a bitcast def) / used by a store (for a bitcast use of the PHI). It is mainly due to this special entry condition that the hook will not work for the feature in this PR. Beyond that special condition, the generic CodeGenPrepare optimizePhiType is also disabled for vector types. Of course we could remove the conditions on the target hook, but this feels like misusing the hook since the features are conceptually different, and we would have a phase-ordering problem (code sinking will fold the newly inserted casts). |
The implementation is still a lot simpler, even if the heuristic and placement are different. Does copying the same technique and using the existing ValueToValueMap work? |
IRBuilder<> ConvertBuilder; | ||
|
||
public: | ||
// The instruction which defined the original virtual register used across |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
/// for doxygen comments
ConversionCandidateInfo FromCCI(&I, I.getParent(), | ||
std::next(I.getIterator())); | ||
FromCCI.setNewType(getCompatibleType(FromCCI.getLiveRegDef())); | ||
for (auto IUser = I.user_begin(); IUser != I.user_end(); IUser++) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This can be a range loop
Type *OriginalType = InstToConvert->getType(); | ||
assert(OriginalType->getScalarSizeInBits() <= | ||
ConvertToScalar->getScalarSizeInBits()); | ||
VectorType *VTy = dyn_cast<VectorType>(OriginalType); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Probably should just use FixedVectorType, you assumed fixed below
We should check if PHI splitting is still needed in CGP with this |
typedef std::pair<Instruction *, BasicBlock *> IncomingPair; | ||
typedef std::pair<Instruction *, SmallVector<IncomingPair, 4>> PHIUpdateInfo; |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
use using?
return Entry.first == UserInst; | ||
}); | ||
|
||
if (PHIOps == PHIUpdater.end()) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Braces
if (auto PHI = dyn_cast<PHINode>(UserInst)) { | ||
for (unsigned Idx = 0; Idx < PHI->getNumIncomingValues(); Idx++) { | ||
Value *IncVal = PHI->getIncomingValue(Idx); | ||
if (&I == dyn_cast<Instruction>(IncVal)) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
don't think you need the dyn_cast
NOTE: This commit is part of a stack which spans across phabricator. The PR is meant only for the top of the stack ([AMDGPU] Add IR LiveReg type-based optimization).
As suggested in #66134, this adds the IR level logic to coerce the type of illegal vectors which have live ranges that span across basic blocks.
The issue is that local ISel will emit CopyToReg / CopyFromReg pairs for live ranges spanning basic blocks. For illegal vector types, the DAGBuilder will legalize by scalarizing the vector, then widening each scalar, and passing each scalar via a separate physical register. See https://godbolt.org/z/Y7MhcjGE8 for a demo of the issue.
This feature identifies cases like these, and inserts bitcasts between the def of the illegal vector and the uses in different blocks. This results in avoiding the scalarization process and an ability to pack the bits into fewer registers -- for example, we now use 2 VGPR for a v8i8 instead of 8.