diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 77db14513254f..d8e8c8f024518 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -502,27 +502,14 @@ static Value *promoteAllocaUserToVector(
     Instruction *Inst, const DataLayout &DL, FixedVectorType *VectorTy,
     unsigned VecStoreSize, unsigned ElementSize,
     DenseMap<MemTransferInst *, MemTransferInfo> &TransferInfo,
-    std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx, Value *CurVal,
-    SmallVectorImpl<LoadInst *> &DeferredLoads) {
+    std::map<GetElementPtrInst *, WeakTrackingVH> &GEPVectorIdx,
+    function_ref<Value *()> GetCurVal) {
   // Note: we use InstSimplifyFolder because it can leverage the DataLayout
   // to do more folding, especially in the case of vector splats.
   IRBuilder<InstSimplifyFolder> Builder(Inst->getContext(),
                                         InstSimplifyFolder(DL));
   Builder.SetInsertPoint(Inst);
 
-  const auto GetOrLoadCurrentVectorValue = [&]() -> Value * {
-    if (CurVal)
-      return CurVal;
-
-    // If the current value is not known, insert a dummy load and lower it on
-    // the second pass.
-    LoadInst *Dummy =
-        Builder.CreateLoad(VectorTy, PoisonValue::get(Builder.getPtrTy()),
-                           "promotealloca.dummyload");
-    DeferredLoads.push_back(Dummy);
-    return Dummy;
-  };
-
   const auto CreateTempPtrIntCast = [&Builder, DL](Value *Val,
                                                    Type *PtrTy) -> Value * {
     assert(DL.getTypeStoreSize(Val->getType()) == DL.getTypeStoreSize(PtrTy));
@@ -542,12 +529,7 @@
 
   switch (Inst->getOpcode()) {
   case Instruction::Load: {
-    // Loads can only be lowered if the value is known.
-    if (!CurVal) {
-      DeferredLoads.push_back(cast<LoadInst>(Inst));
-      return nullptr;
-    }
-
+    Value *CurVal = GetCurVal();
     Value *Index = calculateVectorIndex(
         cast<LoadInst>(Inst)->getPointerOperand(), GEPVectorIdx);
 
@@ -637,7 +619,7 @@
 
       Val = Builder.CreateBitOrPointerCast(Val, SubVecTy);
 
-      Value *CurVec = GetOrLoadCurrentVectorValue();
+      Value *CurVec = GetCurVal();
       for (unsigned K = 0, NumElts = std::min(NumWrittenElts, NumVecElts);
            K < NumElts; ++K) {
         Value *CurIdx =
@@ -650,8 +632,7 @@
 
     if (Val->getType() != VecEltTy)
       Val = Builder.CreateBitOrPointerCast(Val, VecEltTy);
-    return Builder.CreateInsertElement(GetOrLoadCurrentVectorValue(), Val,
-                                       Index);
+    return Builder.CreateInsertElement(GetCurVal(), Val, Index);
   }
   case Instruction::Call: {
     if (auto *MTI = dyn_cast<MemTransferInst>(Inst)) {
@@ -673,7 +654,7 @@
         }
       }
 
-      return Builder.CreateShuffleVector(GetOrLoadCurrentVectorValue(), Mask);
+      return Builder.CreateShuffleVector(GetCurVal(), Mask);
     }
 
     if (auto *MSI = dyn_cast<MemSetInst>(Inst)) {
@@ -1038,37 +1019,46 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) {
   Updater.AddAvailableValue(EntryBB, AllocaInitValue);
 
-  // First handle the initial worklist.
-  SmallVector<LoadInst *, 4> DeferredLoads;
+  // First handle the initial worklist, in basic block order.
+  //
+  // Insert a placeholder whenever we need the vector value at the top of a
+  // basic block.
+  SmallVector<Instruction *> Placeholders;
   forEachWorkListItem(WorkList, [&](Instruction *I) {
     BasicBlock *BB = I->getParent();
-    // On the first pass, we only take values that are trivially known, i.e.
-    // where AddAvailableValue was already called in this block.
-    Value *Result = promoteAllocaUserToVector(
-        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
-        Updater.FindValueForBlock(BB), DeferredLoads);
+    auto GetCurVal = [&]() -> Value * {
+      if (Value *CurVal = Updater.FindValueForBlock(BB))
+        return CurVal;
+
+      if (!Placeholders.empty() && Placeholders.back()->getParent() == BB)
+        return Placeholders.back();
+
+      // If the current value in the basic block is not yet known, insert a
+      // placeholder that we will replace later.
+      IRBuilder<> Builder(I);
+      auto *Placeholder = cast<Instruction>(Builder.CreateFreeze(
+          PoisonValue::get(VectorTy), "promotealloca.placeholder"));
+      Placeholders.push_back(Placeholder);
+      return Placeholders.back();
+    };
+
+    Value *Result =
+        promoteAllocaUserToVector(I, *DL, VectorTy, VecStoreSize, ElementSize,
+                                  TransferInfo, GEPVectorIdx, GetCurVal);
     if (Result)
       Updater.AddAvailableValue(BB, Result);
   });
 
-  // Then handle deferred loads.
-  forEachWorkListItem(DeferredLoads, [&](Instruction *I) {
-    SmallVector<LoadInst *, 0> NewDLs;
-    BasicBlock *BB = I->getParent();
-    // On the second pass, we use GetValueInMiddleOfBlock to guarantee we always
-    // get a value, inserting PHIs as needed.
-    Value *Result = promoteAllocaUserToVector(
-        I, *DL, VectorTy, VecStoreSize, ElementSize, TransferInfo, GEPVectorIdx,
-        Updater.GetValueInMiddleOfBlock(I->getParent()), NewDLs);
-    if (Result)
-      Updater.AddAvailableValue(BB, Result);
-    assert(NewDLs.empty() && "No more deferred loads should be queued!");
-  });
+  // Now fix up the placeholders.
+  for (Instruction *Placeholder : Placeholders) {
+    Placeholder->replaceAllUsesWith(
+        Updater.GetValueInMiddleOfBlock(Placeholder->getParent()));
+    Placeholder->eraseFromParent();
+  }
 
-  // Delete all instructions. On the first pass, new dummy loads may have been
-  // added so we need to collect them too.
+  // Delete all instructions.
   DenseSet<Instruction *> InstsToDelete(WorkList.begin(), WorkList.end());
-  InstsToDelete.insert_range(DeferredLoads);
   for (Instruction *I : InstsToDelete) {
     assert(I->use_empty());
     I->eraseFromParent();
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-cfg.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-cfg.ll
new file mode 100644
index 0000000000000..10d98fd0a0610
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-cfg.ll
@@ -0,0 +1,33 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -passes=amdgpu-promote-alloca < %s | FileCheck %s
+
+; Checks that certain CFGs are handled correctly.
+
+define amdgpu_kernel void @chain(i32 %init, i32 %i.1, i32 %i.2) {
+; CHECK-LABEL: @chain(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A:%.*]] = freeze <4 x i8> poison
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32 [[INIT:%.*]] to <4 x i8>
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x i8> [[TMP0]], i32 [[I_1:%.*]]
+; CHECK-NEXT:    br label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i8> [[TMP0]], i32 [[I_2:%.*]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a = alloca [4 x i8], align 4, addrspace(5)
+  store i32 %init, ptr addrspace(5) %a
+  br label %bb1
+
+bb1:
+  %p.1 = getelementptr i8, ptr addrspace(5) %a, i32 %i.1
+  %x.1 = load i8, ptr addrspace(5) %p.1
+  br label %bb2
+
+bb2:
+  %p.2 = getelementptr i8, ptr addrspace(5) %a, i32 %i.2
+  %x.2 = load i8, ptr addrspace(5) %p.2
+  ret void
+}
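
For context on the placeholder path: in the @chain test above each placeholder resolves to the single reaching definition, so no phi is needed. The interesting case is a join block, where GetValueInMiddleOfBlock must materialize a phi when it replaces the placeholder. The following hand-written sketch is not part of the patch; @diamond and all value names are illustrative, and the expected lowering described in the comments is an assumption based on the placeholder logic above, not autogenerated CHECK lines:

; Sketch: both arms of a diamond write the promoted alloca. On the first
; pass, the load in %merge has no known vector value in its block, so it is
; lowered against a "promotealloca.placeholder" freeze; the fixup loop then
; replaces that placeholder via Updater.GetValueInMiddleOfBlock(%merge),
; which inserts a phi of the two per-arm vector values.
define amdgpu_kernel void @diamond(i1 %cond, i8 %v0, i8 %v1, i32 %idx) {
entry:
  %a = alloca [4 x i8], align 4, addrspace(5)
  br i1 %cond, label %left, label %right

left:
  ; becomes an insertelement into the initial frozen-poison vector
  store i8 %v0, ptr addrspace(5) %a
  br label %merge

right:
  ; becomes an insertelement likewise
  store i8 %v1, ptr addrspace(5) %a
  br label %merge

merge:
  ; expected: %vec = phi <4 x i8> [ ..., %left ], [ ..., %right ]
  ; and the load becomes: extractelement <4 x i8> %vec, i32 %idx
  %p = getelementptr i8, ptr addrspace(5) %a, i32 %idx
  %x = load i8, ptr addrspace(5) %p
  ret void
}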