diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h index 58bde3e9274666..8d11643da03a0b 100644 --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -573,12 +573,22 @@ class IRBuilderBase { NoAliasTag); } + CallInst *CreateMemTransferInst( + Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src, + MaybeAlign SrcAlign, Value *Size, bool isVolatile = false, + MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, + MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); + CallInst *CreateMemCpy(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, Value *Size, bool isVolatile = false, MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, MDNode *ScopeTag = nullptr, - MDNode *NoAliasTag = nullptr); + MDNode *NoAliasTag = nullptr) { + return CreateMemTransferInst(Intrinsic::memcpy, Dst, DstAlign, Src, + SrcAlign, Size, isVolatile, TBAATag, + TBAAStructTag, ScopeTag, NoAliasTag); + } CallInst *CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, Value *Size); diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 3319f48b42d7f0..982a158ff5c042 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -136,22 +136,21 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet( return CI; } -CallInst *IRBuilderBase::CreateMemCpy(Value *Dst, MaybeAlign DstAlign, - Value *Src, MaybeAlign SrcAlign, - Value *Size, bool isVolatile, - MDNode *TBAATag, MDNode *TBAAStructTag, - MDNode *ScopeTag, MDNode *NoAliasTag) { +CallInst *IRBuilderBase::CreateMemTransferInst( + Intrinsic::ID IntrID, Value *Dst, MaybeAlign DstAlign, Value *Src, + MaybeAlign SrcAlign, Value *Size, bool isVolatile, MDNode *TBAATag, + MDNode *TBAAStructTag, MDNode *ScopeTag, MDNode *NoAliasTag) { Dst = getCastedInt8PtrValue(Dst); Src = getCastedInt8PtrValue(Src); Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)}; Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() }; Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys); + Function *TheFn = Intrinsic::getDeclaration(M, IntrID, Tys); CallInst *CI = createCallHelper(TheFn, Ops, this); - auto* MCI = cast(CI); + auto* MCI = cast(CI); if (DstAlign) MCI->setDestAlignment(*DstAlign); if (SrcAlign) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 8050c575c1f834..b1ee44d1db199c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -237,37 +237,40 @@ namespace { class PointerReplacer { public: PointerReplacer(InstCombinerImpl &IC) : IC(IC) {} + + bool collectUsers(Instruction &I); void replacePointer(Instruction &I, Value *V); private: - void findLoadAndReplace(Instruction &I); void replace(Instruction *I); Value *getReplacement(Value *I); - SmallVector Path; + SmallSetVector Worklist; MapVector WorkMap; InstCombinerImpl &IC; }; } // end anonymous namespace -void PointerReplacer::findLoadAndReplace(Instruction &I) { +bool PointerReplacer::collectUsers(Instruction &I) { for (auto U : I.users()) { - auto *Inst = dyn_cast(&*U); - if (!Inst) - return; - LLVM_DEBUG(dbgs() << "Found pointer user: " << *U << '\n'); - if (isa(Inst)) { - for (auto P : Path) - replace(P); - replace(Inst); + Instruction *Inst = cast(&*U); + if (LoadInst *Load = dyn_cast(Inst)) { + if (Load->isVolatile()) + return false; + Worklist.insert(Load); } else if (isa(Inst) || isa(Inst)) { - Path.push_back(Inst); - findLoadAndReplace(*Inst); - Path.pop_back(); + Worklist.insert(Inst); + if (!collectUsers(*Inst)) + return false; + } else if (isa(Inst)) { + Worklist.insert(Inst); } else { - return; + LLVM_DEBUG(dbgs() << "Cannot handle pointer user: " << *U << '\n'); + return false; } } + + return true; } Value *PointerReplacer::getReplacement(Value *V) { @@ -309,6 +312,28 @@ void PointerReplacer::replace(Instruction *I) { IC.InsertNewInstWith(NewI, *BC); NewI->takeName(BC); WorkMap[BC] = NewI; + } else if (auto *MemCpy = dyn_cast(I)) { + auto *SrcV = getReplacement(MemCpy->getRawSource()); + // The pointer may appear in the destination of a copy, but we don't want to + // replace it. + if (!SrcV) { + assert(getReplacement(MemCpy->getRawDest()) && + "destination not in replace list"); + return; + } + + IC.Builder.SetInsertPoint(MemCpy); + auto *NewI = IC.Builder.CreateMemTransferInst( + MemCpy->getIntrinsicID(), MemCpy->getRawDest(), MemCpy->getDestAlign(), + SrcV, MemCpy->getSourceAlign(), MemCpy->getLength(), + MemCpy->isVolatile()); + AAMDNodes AAMD; + MemCpy->getAAMetadata(AAMD); + if (AAMD) + NewI->setAAMetadata(AAMD); + + IC.eraseInstFromFunction(*MemCpy); + WorkMap[MemCpy] = NewI; } else { llvm_unreachable("should never reach here"); } @@ -322,7 +347,9 @@ void PointerReplacer::replacePointer(Instruction &I, Value *V) { "Invalid usage"); #endif WorkMap[&I] = V; - findLoadAndReplace(I); + + for (Instruction *Workitem : Worklist) + replace(Workitem); } Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) { @@ -376,23 +403,21 @@ Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) { // read. SmallVector ToDelete; if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) { + Value *TheSrc = Copy->getSource(); Align AllocaAlign = AI.getAlign(); Align SourceAlign = getOrEnforceKnownAlignment( - Copy->getSource(), AllocaAlign, DL, &AI, &AC, &DT); + TheSrc, AllocaAlign, DL, &AI, &AC, &DT); if (AllocaAlign <= SourceAlign && - isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) { + isDereferenceableForAllocaSize(TheSrc, &AI, DL)) { LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n'); LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n'); - for (unsigned i = 0, e = ToDelete.size(); i != e; ++i) - eraseInstFromFunction(*ToDelete[i]); - Value *TheSrc = Copy->getSource(); - auto *SrcTy = TheSrc->getType(); - auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(), - SrcTy->getPointerAddressSpace()); - Value *Cast = - Builder.CreatePointerBitCastOrAddrSpaceCast(TheSrc, DestTy); - if (AI.getType()->getPointerAddressSpace() == - SrcTy->getPointerAddressSpace()) { + unsigned SrcAddrSpace = TheSrc->getType()->getPointerAddressSpace(); + auto *DestTy = PointerType::get(AI.getAllocatedType(), SrcAddrSpace); + if (AI.getType()->getAddressSpace() == SrcAddrSpace) { + for (Instruction *Delete : ToDelete) + eraseInstFromFunction(*Delete); + + Value *Cast = Builder.CreateBitCast(TheSrc, DestTy); Instruction *NewI = replaceInstUsesWith(AI, Cast); eraseInstFromFunction(*Copy); ++NumGlobalCopies; @@ -400,8 +425,14 @@ Instruction *InstCombinerImpl::visitAllocaInst(AllocaInst &AI) { } PointerReplacer PtrReplacer(*this); - PtrReplacer.replacePointer(AI, Cast); - ++NumGlobalCopies; + if (PtrReplacer.collectUsers(AI)) { + for (Instruction *Delete : ToDelete) + eraseInstFromFunction(*Delete); + + Value *Cast = Builder.CreateBitCast(TheSrc, DestTy); + PtrReplacer.replacePointer(AI, Cast); + ++NumGlobalCopies; + } } } diff --git a/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll b/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll index c49c5a697aed07..2625f6cf22a5c5 100644 --- a/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll +++ b/llvm/test/Transforms/InstCombine/AMDGPU/memcpy-from-constant.ll @@ -23,6 +23,23 @@ define i8 @memcpy_constant_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias rea ret i8 %load } +; Simple memmove to alloca from constant address space argument. +define i8 @memmove_constant_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) { +; CHECK-LABEL: @memmove_constant_arg_ptr_to_alloca( +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr [32 x i8], [32 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]] +; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1 +; CHECK-NEXT: ret i8 [[LOAD]] +; + %alloca = alloca [32 x i8], align 4, addrspace(5) + %alloca.cast = bitcast [32 x i8] addrspace(5)* %alloca to i8 addrspace(5)* + %arg.cast = bitcast [32 x i8] addrspace(4)* %arg to i8 addrspace(4)* + call void @llvm.memmove.p5i8.p4i8.i32(i8 addrspace(5)* %alloca.cast, i8 addrspace(4)* %arg.cast, i32 32, i1 false) + %gep = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* %alloca, i32 0, i32 %idx + %load = load i8, i8 addrspace(5)* %gep + ret i8 %load +} + ; Simple memcpy to alloca from byref constant address space argument. define amdgpu_kernel void @memcpy_constant_byref_arg_ptr_to_alloca([32 x i8] addrspace(4)* noalias readonly align 4 byref([32 x i8]) %arg, i8 addrspace(1)* %out, i32 %idx) { ; CHECK-LABEL: @memcpy_constant_byref_arg_ptr_to_alloca( @@ -87,9 +104,13 @@ define amdgpu_kernel void @memcpy_constant_intrinsic_ptr_to_alloca(i8 addrspace( ; Alloca is written through a flat pointer define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat([31 x i8] addrspace(4)* noalias readonly align 4 dereferenceable(32) %arg, i32 %idx) { ; CHECK-LABEL: @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat( -; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[IDX:%.*]] to i64 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr [31 x i8], [31 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 [[TMP1]] -; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8 addrspace(4)* [[GEP]], align 1 +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(5) +; CHECK-NEXT: [[ALLOCA_CAST:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 0 +; CHECK-NEXT: [[ALLOCA_CAST_ASC:%.*]] = addrspacecast i8 addrspace(5)* [[ALLOCA_CAST]] to i8* +; CHECK-NEXT: [[ARG_CAST:%.*]] = getelementptr inbounds [31 x i8], [31 x i8] addrspace(4)* [[ARG:%.*]], i64 0, i64 0 +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 1 dereferenceable(31) [[ALLOCA_CAST_ASC]], i8 addrspace(4)* align 4 dereferenceable(31) [[ARG_CAST]], i64 31, i1 false) +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [32 x i8], [32 x i8] addrspace(5)* [[ALLOCA]], i32 0, i32 [[IDX:%.*]] +; CHECK-NEXT: [[LOAD:%.*]] = load i8, i8 addrspace(5)* [[GEP]], align 1 ; CHECK-NEXT: ret i8 [[LOAD]] ; %alloca = alloca [32 x i8], align 4, addrspace(5) @@ -125,9 +146,88 @@ define i8 @memcpy_constant_arg_ptr_to_alloca_addrspacecast_to_flat2([32 x i8] ad ret i8 %load } -declare void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0 +%struct.ty = type { [4 x i32] } + +define amdgpu_kernel void @byref_infloop(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 { +; CHECK-LABEL: @byref_infloop( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[CAST_ALLOCA:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p4i8.i32(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ALLOCA]], i32 16, i1 false) +; CHECK-NEXT: ret void +; +bb: + %alloca = alloca [4 x i32], align 4, addrspace(5) + %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)* + %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memcpy.p5i8.p4i8.i32(i8 addrspace(5)* align 4 %cast.alloca, i8 addrspace(4)* align 4 %cast.arg, i32 16, i1 false) + call void @llvm.memcpy.p0i8.p5i8.i32(i8* align 4 %scratch, i8 addrspace(5)* align 4 %cast.alloca, i32 16, i1 false) + ret void +} + +define amdgpu_kernel void @byref_infloop_metadata(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 { +; CHECK-LABEL: @byref_infloop_metadata( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[CAST_ALLOCA:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p4i8.i32(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ALLOCA]], i32 16, i1 false), !noalias !0 +; CHECK-NEXT: ret void +; +bb: + %alloca = alloca [4 x i32], align 4, addrspace(5) + %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)* + %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memcpy.p5i8.p4i8.i32(i8 addrspace(5)* align 4 %cast.alloca, i8 addrspace(4)* align 4 %cast.arg, i32 16, i1 false), !noalias !0 + call void @llvm.memcpy.p0i8.p5i8.i32(i8* align 4 %scratch, i8 addrspace(5)* align 4 %cast.alloca, i32 16, i1 false), !noalias !1 + ret void +} + +define amdgpu_kernel void @byref_infloop_addrspacecast(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 { +; CHECK-LABEL: @byref_infloop_addrspacecast( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [4 x i32], align 4, addrspace(5) +; CHECK-NEXT: [[CAST_ARG:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)* +; CHECK-NEXT: [[CAST_ALLOCA:%.*]] = bitcast [4 x i32] addrspace(5)* [[ALLOCA]] to i8 addrspace(5)* +; CHECK-NEXT: [[ADDRSPACECAST_ALLOCA:%.*]] = addrspacecast i8 addrspace(5)* [[CAST_ALLOCA]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 4 dereferenceable(16) [[ADDRSPACECAST_ALLOCA]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ARG]], i64 16, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8* nonnull align 4 dereferenceable(16) [[ADDRSPACECAST_ALLOCA]], i64 16, i1 false) +; CHECK-NEXT: ret void +; +bb: + %alloca = alloca [4 x i32], align 4, addrspace(5) + %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)* + %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)* + %addrspacecast.alloca = addrspacecast i8 addrspace(5)* %cast.alloca to i8* + call void @llvm.memcpy.p0i8.p4i8.i64(i8* nonnull align 4 dereferenceable(16) %addrspacecast.alloca, i8 addrspace(4)* align 4 dereferenceable(16) %cast.arg, i64 16, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(16) %scratch, i8* nonnull align 4 dereferenceable(16) %addrspacecast.alloca, i64 16, i1 false) + ret void +} + +define amdgpu_kernel void @byref_infloop_memmove(i8* %scratch, %struct.ty addrspace(4)* byref(%struct.ty) align 4 %arg) local_unnamed_addr #1 { +; CHECK-LABEL: @byref_infloop_memmove( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[CAST_ALLOCA:%.*]] = bitcast [[STRUCT_TY:%.*]] addrspace(4)* [[ARG:%.*]] to i8 addrspace(4)* +; CHECK-NEXT: call void @llvm.memmove.p0i8.p4i8.i32(i8* nonnull align 4 dereferenceable(16) [[SCRATCH:%.*]], i8 addrspace(4)* align 4 dereferenceable(16) [[CAST_ALLOCA]], i32 16, i1 false) +; CHECK-NEXT: ret void +; +bb: + %alloca = alloca [4 x i32], align 4, addrspace(5) + %cast.arg = bitcast %struct.ty addrspace(4)* %arg to i8 addrspace(4)* + %cast.alloca = bitcast [4 x i32] addrspace(5)* %alloca to i8 addrspace(5)* + call void @llvm.memmove.p5i8.p4i8.i32(i8 addrspace(5)* align 4 %cast.alloca, i8 addrspace(4)* align 4 %cast.arg, i32 16, i1 false) + call void @llvm.memmove.p0i8.p5i8.i32(i8* align 4 %scratch, i8 addrspace(5)* align 4 %cast.alloca, i32 16, i1 false) + ret void +} + +declare void @llvm.memcpy.p0i8.p5i8.i32(i8* noalias nocapture writeonly, i8 addrspace(5)* noalias nocapture readonly, i32, i1 immarg) #0 +declare void @llvm.memcpy.p5i8.p4i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i32, i1) #0 declare void @llvm.memcpy.p0i8.p4i8.i64(i8* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0 +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0 +declare void @llvm.memcpy.p5i8.p4i8.i64(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i64, i1) #0 +declare void @llvm.memmove.p5i8.p4i8.i32(i8 addrspace(5)* nocapture, i8 addrspace(4)* nocapture, i32, i1) #0 +declare void @llvm.memmove.p0i8.p5i8.i32(i8* nocapture, i8 addrspace(5)* nocapture, i32, i1) #0 declare i8 addrspace(4)* @llvm.amdgcn.kernarg.segment.ptr() #1 attributes #0 = { argmemonly nounwind willreturn } attributes #1 = { nounwind readnone speculatable } + +!0 = !{!0} +!1 = !{!1}