diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
index 9449e70930913..1e730218722b5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPassRegistry.def
@@ -67,6 +67,8 @@ FUNCTION_PASS("amdgpu-simplifylib", AMDGPUSimplifyLibCallsPass())
 FUNCTION_PASS("amdgpu-unify-divergent-exit-nodes",
               AMDGPUUnifyDivergentExitNodesPass())
 FUNCTION_PASS("amdgpu-usenative", AMDGPUUseNativeCallsPass())
+FUNCTION_PASS("amdgpu-vector-idiom",
+              AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32))
 FUNCTION_PASS("si-annotate-control-flow",
               SIAnnotateControlFlowPass(*static_cast<const GCNTargetMachine *>(this)))
 #undef FUNCTION_PASS
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 92a587b5771b6..1249e25114e1f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -34,6 +34,7 @@
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
 #include "AMDGPUUnifyDivergentExitNodes.h"
+#include "AMDGPUVectorIdiom.h"
 #include "AMDGPUWaitSGPRHazards.h"
 #include "GCNDPPCombine.h"
 #include "GCNIterativeScheduler.h"
@@ -905,6 +906,12 @@ void AMDGPUTargetMachine::registerPassBuilderCallbacks(PassBuilder &PB) {
         EnablePromoteKernelArguments)
       FPM.addPass(AMDGPUPromoteKernelArgumentsPass());
 
+    // Run vector-idiom canonicalization early (after inlining) and before
+    // infer-AS / SROA to maximize scalarization opportunities.
+    // Specify 32 bytes since the largest HIP vector types are double4 or
+    // long4.
+    FPM.addPass(AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32));
+
     // Add infer address spaces pass to the opt pipeline after inlining
     // but before SROA to increase SROA opportunities.
     FPM.addPass(InferAddressSpacesPass());
@@ -953,6 +960,8 @@
       if (EnableLowerModuleLDS)
         PM.addPass(AMDGPULowerModuleLDSPass(*this));
       if (Level != OptimizationLevel::O0) {
+        PM.addPass(createModuleToFunctionPassAdaptor(
+            AMDGPUVectorIdiomCombinePass(/*MaxBytes=*/32)));
         // We only want to run this with O2 or higher since inliner and SROA
         // don't run in O1.
         if (Level != OptimizationLevel::O1) {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp
new file mode 100644
index 0000000000000..2703fff9f4e9a
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.cpp
@@ -0,0 +1,519 @@
+//===- AMDGPUVectorIdiom.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// AMDGPU-specific vector idiom canonicalizations to unblock SROA and
+// subsequent scalarization/vectorization.
+//
+// Motivation:
+// - HIP vector types are often modeled as structs and copied with memcpy.
+//   Address-level selects on such copies block SROA. Converting to value-level
+//   operations or splitting the CFG enables SROA to break aggregates, which
+//   unlocks scalarization/vectorization on AMDGPU.
+//
+// Example pattern:
+//   %src = select i1 %c, ptr %A, ptr %B
+//   call void @llvm.memcpy(ptr %dst, ptr %src, i32 16, i1 false)
+//
+// Objectives:
+// - Canonicalize small memcpy patterns where source or destination is a select
+//   of pointers.
+// - Prefer value-level selects (on loaded values) over address-level selects
+//   when safe.
+// - When speculation is unsafe, split the CFG to isolate each arm.
+//
+// Assumptions:
+// - Only handles non-volatile memcpy with constant length N where
+//   0 < N <= MaxBytes (default 32).
+// - Source and destination must be in the same address space.
+// - Speculative loads are allowed only if a conservative alignment check
+//   passes.
+// - No speculative stores are introduced.
+//
+// Transformations:
+// - Source-select memcpy: attempt speculative loads -> value select -> single
+//   store. Fallback is CFG split with two memcpy calls.
+// - Destination-select memcpy: always CFG split to avoid speculative stores.
+//
+// Run this pass early, before SROA.
+//
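+// As an illustrative sketch only (value names and the element type are for
+// exposition; the element type is actually chosen by getIntOrVecTypeForSize
+// from the copy size and the proven alignment), a safe source-select copy of
+// 16 bytes from the example pattern above is rewritten roughly as:
+//
+//   %va = load <4 x i32>, ptr %A
+//   %vb = load <4 x i32>, ptr %B
+//   %v  = select i1 %c, <4 x i32> %va, <4 x i32> %vb
+//   store <4 x i32> %v, ptr %dst
+//
+// while a select-fed destination (or an unsafe source) is instead guarded by
+// a branch on %c, with one memcpy per arm rejoining in a common successor:
+//
+//   br i1 %c, label %memcpy.then, label %memcpy.else
+//   memcpy.then:  memcpy(..., true arm, ...);  br label %memcpy.join
+//   memcpy.else:  memcpy(..., false arm, ...); br label %memcpy.join
+//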
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUVectorIdiom.h"
+#include "AMDGPU.h"
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+using namespace PatternMatch;
+
+#define DEBUG_TYPE "amdgpu-vector-idiom"
+
+namespace {
+
+static cl::opt<bool>
+    AMDGPUVectorIdiomEnable("amdgpu-vector-idiom-enable",
+                            cl::desc("Enable pass AMDGPUVectorIdiom"),
+                            cl::init(true));
+
+// Selects an integer or integer-vector element type matching NBytes, using the
+// minimum proven alignment to decide the widest safe element width.
+// Assumptions:
+// - Pointee types are opaque; the element choice is based solely on size and
+//   alignment.
+// - Falls back to <N x i8> if wider lanes are not safe/aligned.
+static Type *getIntOrVecTypeForSize(uint64_t NBytes, LLVMContext &Ctx,
+                                    Align MinProvenAlign = Align(1)) {
+  auto CanUseI64 = [&]() { return MinProvenAlign >= Align(8); };
+  auto CanUseI32 = [&]() { return MinProvenAlign >= Align(4); };
+  auto CanUseI16 = [&]() { return MinProvenAlign >= Align(2); };
+
+  if (NBytes == 32 && CanUseI64())
+    return FixedVectorType::get(Type::getInt64Ty(Ctx), 4);
+
+  if ((NBytes % 4) == 0 && CanUseI32())
+    return FixedVectorType::get(Type::getInt32Ty(Ctx), NBytes / 4);
+
+  if ((NBytes % 2) == 0 && CanUseI16())
+    return FixedVectorType::get(Type::getInt16Ty(Ctx), NBytes / 2);
+
+  return FixedVectorType::get(Type::getInt8Ty(Ctx), NBytes);
+}
+
+static Align minAlign(Align A, Align B) { return A < B ? A : B; }
+
+// Checks if the underlying object of a memcpy operand is an alloca.
+// This helps focus on scratch memory optimizations by filtering out
+// memcpy operations that don't involve stack-allocated memory.
+static bool hasAllocaUnderlyingObject(Value *V) {
+  Value *Underlying = getUnderlyingObject(V);
+  return isa<AllocaInst>(Underlying);
+}
+
+// Checks if both pointer operands can be speculatively loaded for N bytes and
+// computes the minimum alignment to use.
+// Notes:
+// - Intentionally conservative: relies on isDereferenceablePointer and
+//   getOrEnforceKnownAlignment.
+// - AA/TLI are not used for deeper reasoning here.
+// Emits verbose LLVM_DEBUG logs explaining why speculation is disallowed.
+// Return false reasons include: either arm not dereferenceable, or computed
+// known alignment < 1.
+static bool bothArmsSafeToSpeculateLoads(Value *A, Value *B, uint64_t Size,
+                                         Align &OutAlign, const DataLayout &DL,
+                                         AssumptionCache *AC,
+                                         const DominatorTree *DT,
+                                         Instruction *CtxI) {
+  APInt SizeAPInt(DL.getIndexTypeSizeInBits(A->getType()), Size);
+  if (!isDereferenceableAndAlignedPointer(B, Align(1), SizeAPInt, DL, CtxI, AC,
+                                          DT, nullptr)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: false arm "
+                      << "(B) not dereferenceable for " << Size
+                      << " bytes at align(1)\n");
+    LLVM_DEBUG(dbgs() << " false arm (B) value: " << *B << '\n');
+    return false;
+  }
+
+  Align AlignB =
+      llvm::getOrEnforceKnownAlignment(B, Align(1), DL, nullptr, AC, DT);
+
+  if (AlignB < Align(1)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
+                      << "alignment of false arm (B) < 1: " << AlignB.value()
+                      << '\n');
+    return false;
+  }
+
+  if (!isDereferenceableAndAlignedPointer(A, Align(1), SizeAPInt, DL, CtxI, AC,
+                                          DT, nullptr)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: true arm "
+                      << "(A) not dereferenceable for " << Size
+                      << " bytes at align(1)\n");
+    LLVM_DEBUG(dbgs() << " true arm (A) value: " << *A << '\n');
+    return false;
+  }
+
+  Align AlignA =
+      llvm::getOrEnforceKnownAlignment(A, Align(1), DL, nullptr, AC, DT);
+
+  if (AlignA < Align(1)) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not speculating loads: known "
+                      << "alignment of true arm (A) < 1: " << AlignA.value()
+                      << '\n');
+    return false;
+  }
+
+  OutAlign = minAlign(AlignA, AlignB);
+  LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Speculative loads allowed: "
+                    << "minAlign=" << OutAlign.value() << '\n');
+  return true;
+}
+
+struct AMDGPUVectorIdiomImpl {
+  const unsigned MaxBytes;
+  bool CFGChanged = false;
+
+  AMDGPUVectorIdiomImpl(unsigned MaxBytes) : MaxBytes(MaxBytes) {}
+
+  // Rewrites memcpy when the source is a select of pointers. Prefers a
+  // value-level select (two loads + select + one store) if speculative loads
+  // are safe. Otherwise, falls back to a guarded CFG split with two memcpy
+  // calls. Assumptions:
+  // - Non-volatile, constant length, within MaxBytes.
+  // - Source and destination in the same address space.
+  bool transformSelectMemcpySource(MemCpyInst &MT, SelectInst &Sel,
+                                   const DataLayout &DL,
+                                   const DominatorTree *DT,
+                                   AssumptionCache *AC) {
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Considering memcpy(select-src): "
+                      << MT << '\n');
+    IRBuilder<> B(&MT);
+    Value *Dst = MT.getRawDest();
+    Value *A = Sel.getTrueValue();
+    Value *Bv = Sel.getFalseValue();
+
+    ConstantInt *LenCI = cast<ConstantInt>(MT.getLength());
+    uint64_t N = LenCI->getLimitedValue();
+
+    if (Sel.isVolatile()) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Not rewriting: Select marked "
+                        << "volatile (unexpected) in memcpy source\n");
+      return false;
+    }
+
+    // This is a null check - always use CFG split.
+    Value *Cond = Sel.getCondition();
+    ICmpInst *ICmp = dyn_cast<ICmpInst>(Cond);
+    if (ICmp && ICmp->isEquality() &&
+        (isa<ConstantPointerNull>(ICmp->getOperand(0)) ||
+         isa<ConstantPointerNull>(ICmp->getOperand(1)))) {
+      splitCFGForMemcpy(MT, Sel.getCondition(), A, Bv, true);
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Null check pattern - "
+                           "using CFG split\n");
+      return true;
+    }
+
+    Align DstAlign = MaybeAlign(MT.getDestAlign()).valueOrOne();
+    Align AlignAB;
+    bool CanSpeculate = false;
+
+    const CallBase &CB = MT;
+    const unsigned SrcArgIdx = 1;
+    uint64_t DerefBytes = CB.getParamDereferenceableBytes(SrcArgIdx);
+    bool HasDerefOrNull =
+        CB.paramHasAttr(SrcArgIdx, Attribute::DereferenceableOrNull);
+    bool HasNonNull = CB.paramHasAttr(SrcArgIdx, Attribute::NonNull);
+    MaybeAlign SrcParamAlign = CB.getParamAlign(SrcArgIdx);
+    Align ProvenSrcAlign =
+        SrcParamAlign.value_or(MaybeAlign(MT.getSourceAlign()).valueOrOne());
+
+    if (DerefBytes > 0) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] memcpy source param attrs: "
+                        << "dereferenceable(" << DerefBytes << ")"
+                        << (HasDerefOrNull ? " (or null)" : "")
+                        << (HasNonNull ? ", nonnull" : "") << ", align "
+                        << ProvenSrcAlign.value() << '\n');
+      if (DerefBytes >= N && (!HasDerefOrNull || HasNonNull)) {
+        LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Using memcpy source operand "
+                          << "attributes at this use; accepting speculation\n");
+        CanSpeculate = true;
+        AlignAB = ProvenSrcAlign;
+      } else {
+        LLVM_DEBUG(
+            dbgs() << "[AMDGPUVectorIdiom] Source param attrs not strong "
+                   << "enough for speculation: need dereferenceable(" << N
+                   << ") and nonnull; got dereferenceable(" << DerefBytes << ")"
+                   << (HasDerefOrNull ? " (or null)" : "")
+                   << (HasNonNull ? ", nonnull" : "") << '\n');
+      }
+    } else {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] memcpy source param has no "
+                        << "dereferenceable bytes attribute; align "
+                        << ProvenSrcAlign.value() << '\n');
+    }
+    if (!CanSpeculate)
+      CanSpeculate =
+          bothArmsSafeToSpeculateLoads(A, Bv, N, AlignAB, DL, AC, DT, &MT);
+
+    if (CanSpeculate) {
+      Align MinAlign = std::min(AlignAB, DstAlign);
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-src) "
+                        << "with value-level select; N=" << N
+                        << " minAlign=" << MinAlign.value() << '\n');
+
+      Type *Ty = getIntOrVecTypeForSize(N, B.getContext(), MinAlign);
+
+      LoadInst *LA = B.CreateAlignedLoad(Ty, A, MinAlign);
+      LoadInst *LB = B.CreateAlignedLoad(Ty, Bv, MinAlign);
+      Value *V = B.CreateSelect(Sel.getCondition(), LA, LB);
+
+      (void)B.CreateAlignedStore(V, Dst, DstAlign);
+
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) to "
+                           "value-select loads/stores: "
+                        << MT << '\n');
+      MT.eraseFromParent();
+      return true;
+    }
+
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Falling back to CFG split for "
+                      << "memcpy(select-src); speculation unsafe\n");
+    splitCFGForMemcpy(MT, Sel.getCondition(), A, Bv, true);
+    LLVM_DEBUG(
+        dbgs()
+        << "[AMDGPUVectorIdiom] Rewrote memcpy(select-src) by CFG split\n");
+    return true;
+  }
+
+  // Rewrites memcpy when the destination is a select of pointers. To avoid
+  // speculative stores, always splits the CFG and emits a memcpy per branch.
+  // Assumptions mirror the source case.
+  bool transformSelectMemcpyDest(MemCpyInst &MT, SelectInst &Sel) {
+    Value *DA = Sel.getTrueValue();
+    Value *DB = Sel.getFalseValue();
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Rewriting memcpy(select-dst) via "
+                      << "CFG split to avoid speculative stores: " << MT
+                      << '\n');
+
+    splitCFGForMemcpy(MT, Sel.getCondition(), DA, DB, false);
+    LLVM_DEBUG(
+        dbgs()
+        << "[AMDGPUVectorIdiom] Rewrote memcpy(select-dst) by CFG split\n");
+    return true;
+  }
+
+  // Splits the CFG around a memcpy whose source or destination depends on a
+  // condition. Clones memcpy in then/else using TruePtr/FalsePtr and rejoins.
+  // Assumptions:
+  // - MT has constant length and is non-volatile.
+  // - TruePtr/FalsePtr are correct replacements for the selected operand.
+  void splitCFGForMemcpy(MemCpyInst &MT, Value *Cond, Value *TruePtr,
+                         Value *FalsePtr, bool IsSource) {
+    CFGChanged = true;
+
+    Function *F = MT.getFunction();
+    BasicBlock *Cur = MT.getParent();
+    BasicBlock *ThenBB = BasicBlock::Create(F->getContext(), "memcpy.then", F);
+    BasicBlock *ElseBB = BasicBlock::Create(F->getContext(), "memcpy.else", F);
+    BasicBlock *JoinBB =
+        Cur->splitBasicBlock(BasicBlock::iterator(&MT), "memcpy.join");
+
+    Cur->getTerminator()->eraseFromParent();
+    IRBuilder<> B(Cur);
+    B.CreateCondBr(Cond, ThenBB, ElseBB);
+
+    ConstantInt *LenCI = cast<ConstantInt>(MT.getLength());
+
+    IRBuilder<> BT(ThenBB);
+    if (IsSource) {
+      (void)BT.CreateMemCpy(MT.getRawDest(), MT.getDestAlign(), TruePtr,
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    } else {
+      (void)BT.CreateMemCpy(TruePtr, MT.getDestAlign(), MT.getRawSource(),
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    }
+    BT.CreateBr(JoinBB);
+
+    IRBuilder<> BE(ElseBB);
+    if (IsSource) {
+      (void)BE.CreateMemCpy(MT.getRawDest(), MT.getDestAlign(), FalsePtr,
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    } else {
+      (void)BE.CreateMemCpy(FalsePtr, MT.getDestAlign(), MT.getRawSource(),
+                            MT.getSourceAlign(), LenCI, MT.isVolatile());
+    }
+    BE.CreateBr(JoinBB);
+
+    MT.eraseFromParent();
+  }
+};
+
+} // end anonymous namespace
+
+AMDGPUVectorIdiomCombinePass::AMDGPUVectorIdiomCombinePass(unsigned MaxBytes)
+    : MaxBytes(MaxBytes) {}
+
+// Pass driver that locates small, constant-size, non-volatile memcpy calls
+// where source or destination is a select in the same address space. Applies
+// the source/destination transforms described above. Intended to run early to
+// maximize SROA and subsequent optimizations.
+PreservedAnalyses
+AMDGPUVectorIdiomCombinePass::run(Function &F, FunctionAnalysisManager &FAM) {
+  const DataLayout &DL = F.getParent()->getDataLayout();
+  auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+  auto &AC = FAM.getResult<AssumptionAnalysis>(F);
+
+  if (!AMDGPUVectorIdiomEnable)
+    return PreservedAnalyses::all();
+
+  SmallVector<MemCpyInst *> Worklist;
+  for (Instruction &I : instructions(F)) {
+    if (auto *MC = dyn_cast<MemCpyInst>(&I))
+      Worklist.push_back(MC);
+  }
+
+  bool Changed = false;
+  AMDGPUVectorIdiomImpl Impl(MaxBytes);
+
+  for (MemCpyInst *MT : Worklist) {
+    Value *Dst = MT->getRawDest();
+    Value *Src = MT->getRawSource();
+    if (!isa<SelectInst>(Src) && !isa<SelectInst>(Dst))
+      continue;
+
+    LLVM_DEBUG({
+      Value *DstV = MT->getRawDest();
+      Value *SrcV = MT->getRawSource();
+      unsigned DstAS = cast<PointerType>(DstV->getType())->getAddressSpace();
+      unsigned SrcAS = cast<PointerType>(SrcV->getType())->getAddressSpace();
+      Value *LenV = MT->getLength();
+
+      auto dumpPtrForms = [&](StringRef Label, Value *V) {
+        dbgs() << " " << Label << ": " << *V << '\n';
+
+        Value *StripCasts = V->stripPointerCasts();
+        if (StripCasts != V)
+          dbgs() << " - stripCasts: " << *StripCasts << '\n';
+        else
+          dbgs() << " - stripCasts: (no change)\n";
+
+        Value *Underlying = getUnderlyingObject(V);
+        if (Underlying != V)
+          dbgs() << " - underlying: " << *Underlying << '\n';
+        else
+          dbgs() << " - underlying: (no change)\n";
+      };
+
+      auto dumpSelect = [&](StringRef Which, Value *V) {
+        if (auto *SI = dyn_cast<SelectInst>(V)) {
+          dbgs() << " - " << Which << " is Select: " << *SI << '\n';
+          dbgs() << " cond: " << *SI->getCondition() << '\n';
+          Value *T = SI->getTrueValue();
+          Value *Fv = SI->getFalseValue();
+          dumpPtrForms("true", T);
+          dumpPtrForms("false", Fv);
+          dbgs() << " trueIsAlloca="
+                 << (hasAllocaUnderlyingObject(T) ? "true" : "false") << '\n';
+          dbgs() << " falseIsAlloca="
+                 << (hasAllocaUnderlyingObject(Fv) ? "true" : "false") << '\n';
+        }
+      };
+
+      dbgs() << "[AMDGPUVectorIdiom] Found memcpy: " << *MT << '\n'
+             << " in function: " << F.getName() << '\n'
+             << " - volatile=" << (MT->isVolatile() ? "true" : "false") << '\n'
+             << " - sameAS=" << (DstAS == SrcAS ? "true" : "false")
+             << " (dstAS=" << DstAS << ", srcAS=" << SrcAS << ")\n"
+             << " - constLen=" << (isa<ConstantInt>(LenV) ? "true" : "false");
+      if (auto *LCI = dyn_cast<ConstantInt>(LenV))
+        dbgs() << " (N=" << LCI->getLimitedValue() << ")";
+      dbgs() << '\n'
+             << " - srcIsSelect=" << (isa<SelectInst>(SrcV) ? "true" : "false")
+             << '\n'
+             << " - dstIsSelect=" << (isa<SelectInst>(DstV) ? "true" : "false")
+             << '\n'
+             << " - srcIsAlloca="
+             << (hasAllocaUnderlyingObject(SrcV) ? "true" : "false") << '\n'
+             << " - dstIsAlloca="
+             << (hasAllocaUnderlyingObject(DstV) ? "true" : "false") << '\n';
+
+      dumpSelect("src", SrcV);
+      dumpSelect("dst", DstV);
+    });
+
+    if (MT->isVolatile()) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: memcpy is volatile\n");
+      continue;
+    }
+
+    ConstantInt *LenCI = dyn_cast<ConstantInt>(MT->getLength());
+    if (!LenCI) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: memcpy length is not a "
+                        << "constant integer\n");
+      continue;
+    }
+
+    uint64_t N = LenCI->getLimitedValue();
+    if (N == 0 || N > MaxBytes) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: memcpy size out of range "
+                        << "(N=" << N << ", MaxBytes=" << MaxBytes << ")\n");
+      continue;
+    }
+
+    unsigned DstAS = cast<PointerType>(Dst->getType())->getAddressSpace();
+    unsigned SrcAS = cast<PointerType>(Src->getType())->getAddressSpace();
+    if (DstAS != SrcAS) {
+      LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: address space mismatch "
+                        << "(dstAS=" << DstAS << ", srcAS=" << SrcAS << ")\n");
+      continue;
+    }
+
+    // Check if we have select instructions and if their operands are
+    // alloca-based.
+    bool ShouldTransform = false;
+    if (auto *Sel = dyn_cast<SelectInst>(Src)) {
+      bool TrueIsAlloca = hasAllocaUnderlyingObject(Sel->getTrueValue());
+      bool FalseIsAlloca = hasAllocaUnderlyingObject(Sel->getFalseValue());
+      if (TrueIsAlloca || FalseIsAlloca) {
+        ShouldTransform = true;
+        Changed |= Impl.transformSelectMemcpySource(*MT, *Sel, DL, &DT, &AC);
+      } else {
+        LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: select source operands "
+                          << "are not alloca-based\n");
+      }
+      continue;
+    }
+    if (auto *Sel = dyn_cast<SelectInst>(Dst)) {
+      bool TrueIsAlloca = hasAllocaUnderlyingObject(Sel->getTrueValue());
+      bool FalseIsAlloca = hasAllocaUnderlyingObject(Sel->getFalseValue());
+      if (TrueIsAlloca || FalseIsAlloca) {
+        ShouldTransform = true;
+        Changed |= Impl.transformSelectMemcpyDest(*MT, *Sel);
+      } else {
+        LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: select destination "
+                          << "operands are not alloca-based\n");
+      }
+      continue;
+    }
+
+    LLVM_DEBUG(dbgs() << "[AMDGPUVectorIdiom] Skip: neither source nor "
+                      << "destination is a select of pointers\n");
+  }
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  // Be conservative: preserve only analyses we know remain valid.
+  PreservedAnalyses PA;
+  PA.preserve<AssumptionAnalysis>();
+  PA.preserve<TargetLibraryAnalysis>();
+  PA.preserve<TargetIRAnalysis>();
+
+  // If we didn't change the CFG, we can keep DT/LI/PostDT.
+  if (!Impl.CFGChanged) {
+    PA.preserve<DominatorTreeAnalysis>();
+    PA.preserve<LoopAnalysis>();
+    PA.preserve<PostDominatorTreeAnalysis>();
+  }
+
+  return PA;
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h
new file mode 100644
index 0000000000000..339a604a092a0
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUVectorIdiom.h
@@ -0,0 +1,43 @@
+//===- AMDGPUVectorIdiom.h --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AMDGPU-specific vector idiom canonicalizations to unblock SROA and
+// subsequent scalarization/vectorization.
+//
+// This pass rewrites memcpy with select-fed operands into either:
+// - a value-level select (two loads + select + store), when safe to
+//   speculatively load both arms, or
+// - a conservative CFG split around the condition to isolate each arm.
+//
+// Run this pass early, before SROA.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUVECTORIDIOM_H
+#define LLVM_LIB_TARGET_AMDGPU_AMDGPUVECTORIDIOM_H
+
+#include "AMDGPU.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class AMDGPUVectorIdiomCombinePass
+    : public PassInfoMixin<AMDGPUVectorIdiomCombinePass> {
+  unsigned MaxBytes;
+
+public:
+  /// \p MaxBytes is the max memcpy size (in bytes) to transform in
+  /// AMDGPUVectorIdiom.
+  AMDGPUVectorIdiomCombinePass(unsigned MaxBytes);
+
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPUVECTORIDIOM_H
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
index aae56eef73edd..a90512d77c288 100644
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -119,6 +119,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUTargetTransformInfo.cpp
   AMDGPUWaitSGPRHazards.cpp
   AMDGPUUnifyDivergentExitNodes.cpp
+  AMDGPUVectorIdiom.cpp
   R600MachineCFGStructurizer.cpp
   GCNCreateVOPD.cpp
   GCNDPPCombine.cpp
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll
new file mode 100644
index 0000000000000..b866ba643562e
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-vector-idiom-memcpy-select.ll
@@ -0,0 +1,391 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -amdgpu-vector-idiom-enable -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-vector-idiom -S %s | FileCheck %s
+
+; This test verifies the AMDGPUVectorIdiomCombinePass transforms:
+; 1) memcpy with select-fed source into a value-level select between two loads,
+;    followed by one store (when it's safe to speculate both loads).
+; 2) memcpy with select-fed destination into a control-flow split with two memcpys.
+
+@G0 = addrspace(1) global [4 x i32] zeroinitializer, align 16
+@G1 = addrspace(1) global [4 x i32] zeroinitializer, align 16
+
+declare void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) nocapture writeonly, ptr addrspace(1) nocapture readonly, i64, i1 immarg)
+declare void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) nocapture writeonly, ptr addrspace(5) nocapture readonly, i64, i1 immarg)
+
+; -----------------------------------------------------------------------------
+; Source is a select. Expect value-level select of two <4 x i32> loads
+; and a single store, with no remaining memcpy.
+;
+define amdgpu_kernel void @value_select_src(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PA:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[PB:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[DST:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(5) [[PA]], ptr addrspace(5) [[PB]]
+; CHECK-NEXT:    [[LA:%.*]] = load <4 x i32>, ptr addrspace(5) [[PA]], align 16
+; CHECK-NEXT:    [[LB:%.*]] = load <4 x i32>, ptr addrspace(5) [[PB]], align 16
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[COND]], <4 x i32> [[LA]], <4 x i32> [[LB]]
+; CHECK-NEXT:    store <4 x i32> [[SEL]], ptr addrspace(5) [[DST]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  ; Pointers to two 16-byte aligned buffers using alloca.
+  %pa = alloca [4 x i32], align 16, addrspace(5)
+  %pb = alloca [4 x i32], align 16, addrspace(5)
+  %dst = alloca [4 x i32], align 16, addrspace(5)
+  %src = select i1 %cond, ptr addrspace(5) %pa, ptr addrspace(5) %pb
+
+  ; Provide explicit operand alignments so the pass can emit an aligned store.
+  call void @llvm.memcpy.p5.p5.i64(
+      ptr addrspace(5) align 16 %dst,
+      ptr addrspace(5) align 16 %src,
+      i64 16, i1 false)
+
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Destination is a select. Expect CFG split with two memcpys guarded
+; by a branch (we do not speculate stores in this pass).
+;
+define amdgpu_kernel void @dest_select_cfg_split(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dest_select_cfg_split(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DA:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[DB:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[SRC:%.*]] = alloca [4 x i32], align 16, addrspace(5)
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(5) [[DA]], ptr addrspace(5) [[DB]]
+; CHECK-NEXT:    br i1 [[COND]], label %[[MEMCPY_THEN:.*]], label %[[MEMCPY_ELSE:.*]]
+; CHECK:       [[MEMCPY_JOIN:.*]]:
+; CHECK-NEXT:    ret void
+; CHECK:       [[MEMCPY_THEN]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) [[DA]], ptr addrspace(5) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+; CHECK:       [[MEMCPY_ELSE]]:
+; CHECK-NEXT:    call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) [[DB]], ptr addrspace(5) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    br label %[[MEMCPY_JOIN]]
+;
+entry:
+  %da = alloca [4 x i32], align 16, addrspace(5)
+  %db = alloca [4 x i32], align 16, addrspace(5)
+  %src = alloca [4 x i32], align 16, addrspace(5)
+  %dst = select i1 %cond, ptr addrspace(5) %da, ptr addrspace(5) %db
+  call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) %dst, ptr addrspace(5) %src, i64 16, i1 false)
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Source is a select, 4 x double (32 bytes).
+; Expect value-level select of two <4 x i64> loads and a single store, no memcpy.
+;
+@G2 = addrspace(1) global [4 x double] zeroinitializer, align 32
+@G3 = addrspace(1) global [4 x double] zeroinitializer, align 32
+define amdgpu_kernel void @value_select_src_4xd(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src_4xd(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PA:%.*]] = alloca [4 x double], align 32, addrspace(5)
+; CHECK-NEXT:    [[PB:%.*]] = alloca [4 x double], align 32, addrspace(5)
+; CHECK-NEXT:    [[DST:%.*]] = alloca [4 x double], align 32, addrspace(5)
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(5) [[PA]], ptr addrspace(5) [[PB]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i64>, ptr addrspace(5) [[PA]], align 32
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(5) [[PB]], align 32
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <4 x i64> [[TMP0]], <4 x i64> [[TMP1]]
+; CHECK-NEXT:    store <4 x i64> [[TMP2]], ptr addrspace(5) [[DST]], align 32
+; CHECK-NEXT:    ret void
+;
+entry:
+  %pa = alloca [4 x double], align 32, addrspace(5)
+  %pb = alloca [4 x double], align 32, addrspace(5)
+  %dst = alloca [4 x double], align 32, addrspace(5)
+  %src = select i1 %cond, ptr addrspace(5) %pa, ptr addrspace(5) %pb
+
+  call void @llvm.memcpy.p5.p5.i64(
+      ptr addrspace(5) align 32 %dst,
+      ptr addrspace(5) align 32 %src,
+      i64 32, i1 false)
+
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Source is a select, 3 x char (3 bytes).
+; Expect value-level select using <3 x i8> loads/stores, no memcpy.
+;
+@G4 = addrspace(1) global [3 x i8] zeroinitializer, align 1
+@G5 = addrspace(1) global [3 x i8] zeroinitializer, align 1
+define amdgpu_kernel void @value_select_src_3xc(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src_3xc(
+; CHECK-SAME: i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[PA:%.*]] = alloca [3 x i8], align 1, addrspace(5)
+; CHECK-NEXT:    [[PB:%.*]] = alloca [3 x i8], align 1, addrspace(5)
+; CHECK-NEXT:    [[DST:%.*]] = alloca [3 x i8], align 1, addrspace(5)
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(5) [[PA]], ptr addrspace(5) [[PB]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <3 x i8>, ptr addrspace(5) [[PA]], align 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load <3 x i8>, ptr addrspace(5) [[PB]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = select i1 [[COND]], <3 x i8> [[TMP0]], <3 x i8> [[TMP1]]
+; CHECK-NEXT:    store <3 x i8> [[TMP2]], ptr addrspace(5) [[DST]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %pa = alloca [3 x i8], align 1, addrspace(5)
+  %pb = alloca [3 x i8], align 1, addrspace(5)
+  %dst = alloca [3 x i8], align 1, addrspace(5)
+  %src = select i1 %cond, ptr addrspace(5) %pa, ptr addrspace(5) %pb
+
+  call void @llvm.memcpy.p5.p5.i64(
+      ptr addrspace(5) align 1 %dst,
+      ptr addrspace(5) align 1 %src,
+      i64 3, i1 false)
+
+  ret void
+}
+
+; -----------------------------------------------------------------------------
+; Source is a select with constant expression GEP arms. Neither arm is
+; alloca-based, so the pass skips it and the memcpy remains.
+;
+@GEPA = addrspace(1) global [4 x i32] zeroinitializer, align 16
+@GEPB = addrspace(1) global [4 x i32] zeroinitializer, align 16
+define amdgpu_kernel void @value_select_src_constexpr_gep(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @value_select_src_constexpr_gep(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) @GEPA, ptr addrspace(1) @GEPB
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) align 16 [[DST]], ptr addrspace(1) align 16 [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  ; Constant expression GEPs to the base elements
+  %src = select i1 %cond,
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPA, i64 0, i64 0),
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPB, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(
+      ptr addrspace(1) align 16 %dst,
+      ptr addrspace(1) align 16 %src,
+      i64 16, i1 false)
+
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Destination is a select with constant expression GEP arms. Neither arm is
+; alloca-based, so no CFG split is performed and the memcpy remains.
+;
+define amdgpu_kernel void @dest_select_constexpr_gep(ptr addrspace(1) %src, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dest_select_constexpr_gep(
+; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) @GEPA, ptr addrspace(1) @GEPB
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = select i1 %cond,
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPA, i64 0, i64 0),
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GEPB, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Source is a select where one arm is null. Neither arm is alloca-based, so the
+; pass skips this memcpy (no speculative loads and no CFG split).
+;
+@GN = addrspace(1) global [4 x i32] zeroinitializer, align 16
+define amdgpu_kernel void @src_select_null_arm(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @src_select_null_arm(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[NONNULL:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @GN, i64 0, i64 0
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[NONNULL]], ptr addrspace(1) null
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonnull = getelementptr inbounds [4 x i32], ptr addrspace(1) @GN, i64 0, i64 0
+  %src = select i1 %cond, ptr addrspace(1) %nonnull, ptr addrspace(1) null
+
+  call void @llvm.memcpy.p1.p1.i64(
+      ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Destination is a select where one arm is null. Neither arm is alloca-based,
+; so the pass skips this memcpy (no speculative stores and no CFG split).
+;
+define amdgpu_kernel void @dst_select_null_arm(ptr addrspace(1) %src, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dst_select_null_arm(
+; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) null, ptr addrspace(1) @GN
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) null,
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GN, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Source is a select where one arm is poison. Neither arm is alloca-based, so
+; the pass skips this memcpy and no CFG split is emitted.
+;
+@GP = addrspace(1) global [4 x i32] zeroinitializer, align 16
+define amdgpu_kernel void @src_select_poison_arm(ptr addrspace(1) %dst, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @src_select_poison_arm(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[NONNULL:%.*]] = getelementptr inbounds [4 x i32], ptr addrspace(1) @GP, i64 0, i64 0
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[NONNULL]], ptr addrspace(1) poison
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %nonnull = getelementptr inbounds [4 x i32], ptr addrspace(1) @GP, i64 0, i64 0
+  %src = select i1 %cond, ptr addrspace(1) %nonnull, ptr addrspace(1) poison
+
+  call void @llvm.memcpy.p1.p1.i64(
+      ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Destination is a select where one arm is poison. Neither arm is alloca-based,
+; so the memcpy is left untouched.
+;
+define amdgpu_kernel void @dst_select_poison_arm(ptr addrspace(1) %src, i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @dst_select_poison_arm(
+; CHECK-SAME: ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) poison, ptr addrspace(1) @GP
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) poison,
+      ptr addrspace(1) getelementptr inbounds ([4 x i32], ptr addrspace(1) @GP, i64 0, i64 0)
+
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst, ptr addrspace(1) %src, i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Non-constant memcpy length: the pass should not transform.
+; Expect: memcpy remains as-is (no load/select/store, no CFG split).
+;
+define amdgpu_kernel void @memcpy_nonconst_length_src_select(ptr addrspace(1) %dst,
+; CHECK-LABEL: define amdgpu_kernel void @memcpy_nonconst_length_src_select(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[PA:%.*]], ptr addrspace(1) [[PB:%.*]], i1 [[COND:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 [[N]], i1 false)
+; CHECK-NEXT:    ret void
+;
+                                                              ptr addrspace(1) %pa,
+                                                              ptr addrspace(1) %pb,
+                                                              i1 %cond, i64 %n) {
+entry:
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst,
+                                   ptr addrspace(1) %src,
+                                   i64 %n, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; Non-constant memcpy length with destination select: pass should not transform.
+; Expect: memcpy remains, no CFG split.
+;
+define amdgpu_kernel void @memcpy_nonconst_length_dst_select(ptr addrspace(1) %da,
+; CHECK-LABEL: define amdgpu_kernel void @memcpy_nonconst_length_dst_select(
+; CHECK-SAME: ptr addrspace(1) [[DA:%.*]], ptr addrspace(1) [[DB:%.*]], ptr addrspace(1) [[SRC:%.*]], i1 [[COND:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) [[DA]], ptr addrspace(1) [[DB]]
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 [[N]], i1 false)
+; CHECK-NEXT:    ret void
+;
+                                                              ptr addrspace(1) %db,
+                                                              ptr addrspace(1) %src,
+                                                              i1 %cond, i64 %n) {
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) %da, ptr addrspace(1) %db
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst,
+                                   ptr addrspace(1) %src,
+                                   i64 %n, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; "Non-constant source" scenario: select arms are function args (not globals).
+; The arms are not alloca-based, so the pass skips this memcpy entirely; no
+; speculation and no CFG split take place.
+;
+define amdgpu_kernel void @memcpy_src_select_arg_arms_cfg_split(ptr addrspace(1) %dst,
+; CHECK-LABEL: define amdgpu_kernel void @memcpy_src_select_arg_arms_cfg_split(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[PA:%.*]], ptr addrspace(1) [[PB:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+                                                                ptr addrspace(1) %pa,
+                                                                ptr addrspace(1) %pb,
+                                                                i1 %cond) {
+entry:
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+  call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) %dst,
+                                   ptr addrspace(1) %src,
+                                   i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; memmove should be ignored by the pass even with select-fed source/dest.
+; Expect: memmove remains as-is (no CFG split, no speculative transform).
+;
+declare void @llvm.memmove.p1.p1.i64(ptr addrspace(1) nocapture writeonly,
+                                     ptr addrspace(1) nocapture readonly,
+                                     i64, i1 immarg)
+
+define amdgpu_kernel void @memmove_ignored_src_select(ptr addrspace(1) %dst,
+; CHECK-LABEL: define amdgpu_kernel void @memmove_ignored_src_select(
+; CHECK-SAME: ptr addrspace(1) [[DST:%.*]], ptr addrspace(1) [[PA:%.*]], ptr addrspace(1) [[PB:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[SRC:%.*]] = select i1 [[COND]], ptr addrspace(1) [[PA]], ptr addrspace(1) [[PB]]
+; CHECK-NEXT:    call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) [[DST]], ptr addrspace(1) [[SRC]], i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+                                                       ptr addrspace(1) %pa,
+                                                       ptr addrspace(1) %pb,
+                                                       i1 %cond) {
+entry:
+  %src = select i1 %cond, ptr addrspace(1) %pa, ptr addrspace(1) %pb
+  call void @llvm.memmove.p1.p1.i64(ptr addrspace(1) %dst,
+                                    ptr addrspace(1) %src,
+                                    i64 16, i1 false)
+  ret void
+}
+; -----------------------------------------------------------------------------
+; memset should be ignored by the pass, even if destination is a select.
+; Expect: memset remains as-is (no CFG split).
+;
+declare void @llvm.memset.p1.i64(ptr addrspace(1) nocapture writeonly,
+                                 i8, i64, i1 immarg)
+
+define amdgpu_kernel void @memset_ignored_dst_select(ptr addrspace(1) %da,
+; CHECK-LABEL: define amdgpu_kernel void @memset_ignored_dst_select(
+; CHECK-SAME: ptr addrspace(1) [[DA:%.*]], ptr addrspace(1) [[DB:%.*]], i1 [[COND:%.*]]) {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[DST:%.*]] = select i1 [[COND]], ptr addrspace(1) [[DA]], ptr addrspace(1) [[DB]]
+; CHECK-NEXT:    call void @llvm.memset.p1.i64(ptr addrspace(1) [[DST]], i8 0, i64 16, i1 false)
+; CHECK-NEXT:    ret void
+;
+                                                      ptr addrspace(1) %db,
+                                                      i1 %cond) {
+entry:
+  %dst = select i1 %cond, ptr addrspace(1) %da, ptr addrspace(1) %db
+  call void @llvm.memset.p1.i64(ptr addrspace(1) %dst, i8 0, i64 16, i1 false)
+  ret void
+}
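+; -----------------------------------------------------------------------------
+; Illustrative additional case, hand-written rather than autogenerated by
+; update_test_checks.py: when the select condition is a pointer null check,
+; transformSelectMemcpySource is expected to take the CFG-split path even
+; though one arm is an alloca. The CHECK lines below are a loose sketch of the
+; expected split, not regenerated output.
+;
+define amdgpu_kernel void @src_select_null_check_cond(ptr addrspace(5) %q) {
+; CHECK-LABEL: define amdgpu_kernel void @src_select_null_check_cond(
+; CHECK: br i1 %is.null, label %memcpy.then, label %memcpy.else
+; CHECK: memcpy.join:
+; CHECK-NEXT: ret void
+; CHECK: memcpy.then:
+; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 %pa, i64 16, i1 false)
+; CHECK: memcpy.else:
+; CHECK-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 %q, i64 16, i1 false)
+entry:
+  %pa = alloca [4 x i32], align 16, addrspace(5)
+  %dst = alloca [4 x i32], align 16, addrspace(5)
+  %is.null = icmp eq ptr addrspace(5) %q, null
+  %src = select i1 %is.null, ptr addrspace(5) %pa, ptr addrspace(5) %q
+  call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 16 %dst, ptr addrspace(5) align 16 %src, i64 16, i1 false)
+  ret void
+}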