diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index 29193e0c541b9..a96c796b18ddf 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1450,6 +1450,14 @@ static uint64_t maxFakeUseAggregateSize(const ASTContext &C) { return 4 * C.getTypeSize(C.UnsignedIntTy); } +static bool checkIsReadOnlyMetadataAvailable(QualType Ty, + const LangOptions &LO) { + bool IsLangSupported = + LO.C99 || LO.C11 || LO.C17 || LO.C23 || LO.C2y || LO.CPlusPlus; + // Currently support only for scalar types + return IsLangSupported && Ty.isConstQualified() && Ty->isScalarType(); +} + // Helper function to determine whether a variable's or parameter's lifetime // should be extended. static bool shouldExtendLifetime(const ASTContext &Context, @@ -1601,9 +1609,10 @@ CodeGenFunction::EmitAutoVarAlloca(const VarDecl &D) { // Create the alloca. Note that we set the name separately from // building the instruction so that it's there even in no-asserts // builds. - address = CreateTempAlloca(allocaTy, Ty.getAddressSpace(), - allocaAlignment, D.getName(), - /*ArraySize=*/nullptr, &AllocaAddr); + address = CreateTempAlloca( + allocaTy, Ty.getAddressSpace(), allocaAlignment, D.getName(), + /*ArraySize=*/nullptr, &AllocaAddr, + checkIsReadOnlyMetadataAvailable(Ty, getLangOpts())); // Don't emit lifetime markers for MSVC catch parameters. The lifetime of // the catch parameter starts in the catchpad instruction, and we can't diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index e8456a44f8367..a2351d30af394 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -99,11 +99,17 @@ static llvm::StringRef GetUBSanTrapForHandler(SanitizerHandler ID) { /// CreateTempAlloca - This creates a alloca and inserts it into the entry /// block. 
-RawAddress -CodeGenFunction::CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits Align, - const Twine &Name, - llvm::Value *ArraySize) { +RawAddress CodeGenFunction::CreateTempAllocaWithoutCast(llvm::Type *Ty, + CharUnits Align, + const Twine &Name, + llvm::Value *ArraySize, + bool IsReadOnly) { auto Alloca = CreateTempAlloca(Ty, Name, ArraySize); + if (IsReadOnly) { + llvm::MDNode *Node = llvm::MDNode::get( + getLLVMContext(), llvm::ConstantAsMetadata::get(Builder.getInt32(1))); + Alloca->setMetadata(llvm::LLVMContext::MD_immutable, Node); + } Alloca->setAlignment(Align.getAsAlign()); return RawAddress(Alloca, Ty, Align, KnownNonNull); } @@ -138,8 +144,10 @@ RawAddress CodeGenFunction::MaybeCastStackAddressSpace(RawAddress Alloca, RawAddress CodeGenFunction::CreateTempAlloca(llvm::Type *Ty, LangAS DestLangAS, CharUnits Align, const Twine &Name, llvm::Value *ArraySize, - RawAddress *AllocaAddr) { - RawAddress Alloca = CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize); + RawAddress *AllocaAddr, + bool IsReadOnly) { + RawAddress Alloca = + CreateTempAllocaWithoutCast(Ty, Align, Name, ArraySize, IsReadOnly); if (AllocaAddr) *AllocaAddr = Alloca; return MaybeCastStackAddressSpace(Alloca, DestLangAS, ArraySize); diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 123cb4f51f828..c64312ba8e52a 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -2848,7 +2848,8 @@ class CodeGenFunction : public CodeGenTypeCache { RawAddress CreateTempAlloca(llvm::Type *Ty, LangAS UseAddrSpace, CharUnits align, const Twine &Name = "tmp", llvm::Value *ArraySize = nullptr, - RawAddress *Alloca = nullptr); + RawAddress *Alloca = nullptr, + bool IsReadOnly = false); /// CreateTempAlloca - This creates a alloca and inserts it into the entry /// block. The alloca is casted to default address space if necessary. 
@@ -2865,7 +2866,8 @@ class CodeGenFunction : public CodeGenTypeCache { RawAddress CreateTempAllocaWithoutCast(llvm::Type *Ty, CharUnits align, const Twine &Name = "tmp", - llvm::Value *ArraySize = nullptr); + llvm::Value *ArraySize = nullptr, + bool IsReadOnly = false); /// CreateDefaultAlignedTempAlloca - This creates an alloca with the /// default ABI alignment of the given LLVM type. diff --git a/clang/test/CodeGen/const-alloca.c b/clang/test/CodeGen/const-alloca.c new file mode 100644 index 0000000000000..96b89fc7f8aff --- /dev/null +++ b/clang/test/CodeGen/const-alloca.c @@ -0,0 +1,17 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 %s -emit-llvm -o - | FileCheck %s + +// CHECK-LABEL: define dso_local i32 @test( +// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[X:%.*]] = alloca i32, align 4, !immutable [[META2:![0-9]+]] +// CHECK-NEXT: store i32 1, ptr [[X]], align 4 +// CHECK-NEXT: ret i32 1 +// +int test() { + const int x = 1; + return x; +} +//. +// CHECK: [[META2]] = !{i32 1} +//. 
diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index d71c89811f04b..9dead762a8bd4 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -129,7 +129,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU-NEXT: [[VARTMP11:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) // NOCPU-NEXT: [[BLOCK12:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // NOCPU-NEXT: [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5) -// NOCPU-NEXT: [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5) +// NOCPU-NEXT: [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5), !immutable [[META7:![0-9]+]] // NOCPU-NEXT: [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // NOCPU-NEXT: [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) // NOCPU-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr @@ -235,7 +235,7 @@ kernel void test_target_features_kernel(global int *i) { // // NOCPU: Function Attrs: convergent noinline norecurse nounwind optnone // NOCPU-LABEL: define dso_local amdgpu_kernel void @test_target_features_kernel( -// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7:![0-9]+]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] { +// NOCPU-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR4:[0-9]+]] !kernel_arg_addr_space [[META7]] !kernel_arg_access_qual [[META8:![0-9]+]] !kernel_arg_type [[META9:![0-9]+]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META10:![0-9]+]] { // NOCPU-NEXT: [[ENTRY:.*:]] // NOCPU-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // NOCPU-NEXT: [[I_ADDR_ASCAST:%.*]] = 
addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr @@ -503,7 +503,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[VARTMP11:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) // GFX900-NEXT: [[BLOCK12:%.*]] = alloca <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, align 8, addrspace(5) // GFX900-NEXT: [[BLOCK_SIZES:%.*]] = alloca [1 x i64], align 8, addrspace(5) -// GFX900-NEXT: [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5) +// GFX900-NEXT: [[BLOCK20:%.*]] = alloca ptr, align 8, addrspace(5), !immutable [[META17:![0-9]+]] // GFX900-NEXT: [[BLOCK21:%.*]] = alloca <{ i32, i32, ptr, i64, ptr addrspace(1) }>, align 8, addrspace(5) // GFX900-NEXT: [[VARTMP27:%.*]] = alloca [[STRUCT_NDRANGE_T]], align 4, addrspace(5) // GFX900-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr @@ -525,11 +525,11 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: store i64 [[D]], ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9:[0-9]+]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] -// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17:![0-9]+]] +// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18:![0-9]+]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] -// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19:![0-9]+]] -// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21:![0-9]+]] +// GFX900-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa 
[[TBAA20:![0-9]+]] +// GFX900-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]] +// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22:![0-9]+]] // GFX900-NEXT: [[BLOCK_SIZE:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 25, ptr [[BLOCK_SIZE]], align 8 // GFX900-NEXT: [[BLOCK_ALIGN:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), i8 }>, ptr [[BLOCK_ASCAST]], i32 0, i32 1 @@ -543,9 +543,9 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP3:%.*]] = load i8, ptr [[B_ADDR_ASCAST]], align 1, !tbaa [[TBAA16]] // GFX900-NEXT: store i8 [[TMP3]], ptr [[BLOCK_CAPTURED1]], align 8, !tbaa [[TBAA16]] // GFX900-NEXT: [[TMP4:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP0]], i32 [[TMP1]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_kernel.runtime.handle to ptr), ptr [[BLOCK_ASCAST]]) -// GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]] -// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP2_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] +// GFX900-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]] +// GFX900-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]] +// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP2_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]] // GFX900-NEXT: [[BLOCK_SIZE4:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr 
addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 41, ptr [[BLOCK_SIZE4]], align 8 // GFX900-NEXT: [[BLOCK_ALIGN5:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK3_ASCAST]], i32 0, i32 1 @@ -565,9 +565,9 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP10:%.*]] = load i64, ptr [[D_ADDR_ASCAST]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: store i64 [[TMP10]], ptr [[BLOCK_CAPTURED10]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: [[TMP11:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP5]], i32 [[TMP6]], ptr addrspace(5) [[VARTMP2]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_2_kernel.runtime.handle to ptr), ptr [[BLOCK3_ASCAST]]) -// GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]] -// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP11_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] +// GFX900-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]] +// GFX900-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]] +// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP11_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]] // GFX900-NEXT: [[BLOCK_SIZE13:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 0 // GFX900-NEXT: store i32 41, ptr [[BLOCK_SIZE13]], align 8 // GFX900-NEXT: [[BLOCK_ALIGN14:%.*]] = getelementptr inbounds nuw <{ i32, i32, ptr, ptr addrspace(1), ptr addrspace(1), i64, i8 }>, ptr [[BLOCK12_ASCAST]], i32 0, i32 1 @@ -605,9 +605,9 @@ kernel void 
test_target_features_kernel(global int *i) { // GFX900-NEXT: [[TMP21:%.*]] = load ptr addrspace(1), ptr [[C_ADDR_ASCAST]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: store ptr addrspace(1) [[TMP21]], ptr [[BLOCK_CAPTURED26]], align 8, !tbaa [[TBAA7]] // GFX900-NEXT: store ptr [[BLOCK21_ASCAST]], ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA16]] -// GFX900-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]] -// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] +// GFX900-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]] +// GFX900-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]] +// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP27_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]] // GFX900-NEXT: [[TMP24:%.*]] = load ptr, ptr addrspace(5) [[BLOCK20]], align 8, !tbaa [[TBAA16]] // GFX900-NEXT: [[TMP25:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP22]], i32 [[TMP23]], ptr addrspace(5) [[VARTMP27]], ptr addrspacecast (ptr addrspace(1) @__test_block_invoke_4_kernel.runtime.handle to ptr), ptr [[BLOCK21_ASCAST]]) // GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[BLOCK20]]) #[[ATTR9]] @@ -619,7 +619,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: convergent norecurse nounwind // GFX900-LABEL: define dso_local amdgpu_kernel void @test_target_features_kernel( -// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META22:![0-9]+]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type 
[[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { +// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR2]] !kernel_arg_addr_space [[META17]] !kernel_arg_access_qual [[META23:![0-9]+]] !kernel_arg_type [[META24:![0-9]+]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25:![0-9]+]] { // GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[I_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[I_ADDR]] to ptr @@ -631,7 +631,7 @@ kernel void test_target_features_kernel(global int *i) { // // GFX900: Function Attrs: alwaysinline convergent norecurse nounwind // GFX900-LABEL: define dso_local void @__clang_ocl_kern_imp_test_target_features_kernel( -// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META22]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { +// GFX900-SAME: ptr addrspace(1) noundef align 4 [[I:%.*]]) #[[ATTR3]] !kernel_arg_addr_space [[META17]] !kernel_arg_access_qual [[META23]] !kernel_arg_type [[META24]] !kernel_arg_base_type [[META24]] !kernel_arg_type_qual [[META25]] { // GFX900-NEXT: [[ENTRY:.*:]] // GFX900-NEXT: [[I_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) // GFX900-NEXT: [[DEFAULT_QUEUE:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) @@ -643,12 +643,12 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: store ptr addrspace(1) [[I]], ptr [[I_ADDR_ASCAST]], align 8, !tbaa [[TBAA26]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[DEFAULT_QUEUE]]) #[[ATTR9]] // GFX900-NEXT: call void @llvm.lifetime.start.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] -// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: store i32 0, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]] // GFX900-NEXT: call void 
@llvm.lifetime.start.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] // GFX900-NEXT: [[TMP0:%.*]] = call i64 @llvm.amdgcn.s.memtime() -// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA19]] -// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA17]] -// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT21]] +// GFX900-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[DEFAULT_QUEUE]], align 8, !tbaa [[TBAA20]] +// GFX900-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(5) [[FLAGS]], align 4, !tbaa [[TBAA18]] +// GFX900-NEXT: call void @llvm.memcpy.p0.p5.i64(ptr align 4 [[TMP_ASCAST]], ptr addrspace(5) align 4 [[NDRANGE]], i64 4, i1 false), !tbaa.struct [[TBAA_STRUCT22]] // GFX900-NEXT: [[TMP3:%.*]] = call i32 @__enqueue_kernel_basic(ptr addrspace(1) [[TMP1]], i32 [[TMP2]], ptr addrspace(5) [[TMP]], ptr addrspacecast (ptr addrspace(1) @__test_target_features_kernel_block_invoke_kernel.runtime.handle to ptr), ptr addrspacecast (ptr addrspace(1) @__block_literal_global to ptr)) // GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[NDRANGE]]) #[[ATTR9]] // GFX900-NEXT: call void @llvm.lifetime.end.p5(ptr addrspace(5) [[FLAGS]]) #[[ATTR9]] @@ -740,7 +740,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900-NEXT: store i64 [[TMP2]], ptr addrspace(1) [[ARRAYIDX4]], align 8, !tbaa [[TBAA3]] // GFX900-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr [[LP_ADDR_ASCAST]], align 4, !tbaa [[TBAA32]] // GFX900-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr addrspace(3) [[TMP4]], i64 0 -// GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA17]] +// GFX900-NEXT: store i32 1, ptr addrspace(3) [[ARRAYIDX5]], align 4, !tbaa [[TBAA18]] // GFX900-NEXT: ret void // // @@ -866,12 +866,12 @@ kernel void 
test_target_features_kernel(global int *i) { // GFX900: [[TBAA14]] = !{[[META15:![0-9]+]], [[META15]], i64 0} // GFX900: [[META15]] = !{!"p1 omnipotent char", [[META9]], i64 0} // GFX900: [[TBAA16]] = !{[[META5]], [[META5]], i64 0} -// GFX900: [[TBAA17]] = !{[[META18:![0-9]+]], [[META18]], i64 0} -// GFX900: [[META18]] = !{!"int", [[META5]], i64 0} -// GFX900: [[TBAA19]] = !{[[META20:![0-9]+]], [[META20]], i64 0} -// GFX900: [[META20]] = !{!"queue_t", [[META5]], i64 0} -// GFX900: [[TBAA_STRUCT21]] = !{i64 0, i64 4, [[TBAA17]]} -// GFX900: [[META22]] = !{i32 1} +// GFX900: [[META17]] = !{i32 1} +// GFX900: [[TBAA18]] = !{[[META19:![0-9]+]], [[META19]], i64 0} +// GFX900: [[META19]] = !{!"int", [[META5]], i64 0} +// GFX900: [[TBAA20]] = !{[[META21:![0-9]+]], [[META21]], i64 0} +// GFX900: [[META21]] = !{!"queue_t", [[META5]], i64 0} +// GFX900: [[TBAA_STRUCT22]] = !{i64 0, i64 4, [[TBAA18]]} // GFX900: [[META23]] = !{!"none"} // GFX900: [[META24]] = !{!"int*"} // GFX900: [[META25]] = !{!""} diff --git a/llvm/include/llvm/IR/FixedMetadataKinds.def b/llvm/include/llvm/IR/FixedMetadataKinds.def index d09cc15d65ff6..a7736166b8a44 100644 --- a/llvm/include/llvm/IR/FixedMetadataKinds.def +++ b/llvm/include/llvm/IR/FixedMetadataKinds.def @@ -55,3 +55,4 @@ LLVM_FIXED_MD_KIND(MD_mmra, "mmra", 40) LLVM_FIXED_MD_KIND(MD_noalias_addrspace, "noalias.addrspace", 41) LLVM_FIXED_MD_KIND(MD_callee_type, "callee_type", 42) LLVM_FIXED_MD_KIND(MD_nofree, "nofree", 43) +LLVM_FIXED_MD_KIND(MD_immutable, "immutable", 44) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 9b4f1dc6ddb34..f33d477fcbfee 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -102,6 +102,7 @@ using namespace llvm; #define DEBUG_TYPE "sroa" STATISTIC(NumAllocasAnalyzed, "Number of allocas analyzed for replacement"); +STATISTIC(NumConstAllocasPropagated, "Number of immutable allocas propagated"); STATISTIC(NumAllocaPartitions, 
"Number of alloca partitions formed"); STATISTIC(MaxPartitionsPerAlloca, "Maximum number of partitions per alloca"); STATISTIC(NumAllocaPartitionUses, "Number of alloca partition uses rewritten"); @@ -248,6 +249,7 @@ class SROA { bool presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS); AllocaInst *rewritePartition(AllocaInst &AI, AllocaSlices &AS, Partition &P); bool splitAlloca(AllocaInst &AI, AllocaSlices &AS); + bool tryToPropagateImmutableAllocaInitValue(AllocaInst &AI, AllocaSlices &AS); bool propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS); std::pair<bool /*Changed*/, bool /*CFGChanged*/> runOnAlloca(AllocaInst &AI); void clobberUse(Use &U); @@ -5452,6 +5454,47 @@ class BasicLoadAndStorePromoter : public LoadAndStorePromoter { Type *ZeroType; }; +bool SROA::tryToPropagateImmutableAllocaInitValue(AllocaInst &AI, + AllocaSlices &AS) { + // If an alloca of a scalar type is marked with the immutable metadata, it + // means that it cannot be reinitialized. Therefore, we can propagate its + // initial value quite early throughout, even if the alloca is escaped. 
+  bool Changed = false; +  SmallVector<User *> AIStoreUsers; + +  copy_if(AI.users(), std::back_inserter(AIStoreUsers), [&AI](auto *U) { +    auto *SI = dyn_cast<StoreInst>(U); +    return (SI && SI->getPointerOperand() == &AI); +  }); + +  if (range_size(AIStoreUsers) != 1) +    return Changed; + +  SmallVector<User *> AILoadUsers; + +  copy_if(AI.users(), std::back_inserter(AILoadUsers), [&AI](auto *U) { +    auto *LI = dyn_cast<LoadInst>(U); +    return (LI && LI->getPointerOperand() == &AI); +  }); + +  auto *StoreInitValInst = dyn_cast<StoreInst>(AIStoreUsers.front()); + +  assert(StoreInitValInst); +  auto *InitVal = StoreInitValInst->getValueOperand(); + +  for (User *U : AILoadUsers) { +    auto *LI = dyn_cast<LoadInst>(U); +    assert(LI); +    assert(DTU->getDomTree().dominates(InitVal, LI)); +    assert(InitVal->getType() == LI->getType()); +    ++NumConstAllocasPropagated; +    LI->replaceAllUsesWith(InitVal); +    Changed |= true; +  } + +  return Changed; +} + bool SROA::propagateStoredValuesToLoads(AllocaInst &AI, AllocaSlices &AS) { // Look through each "partition", looking for slices with the same start/end // that do not overlap with any before them. The slices are sorted by @@ -5564,8 +5607,11 @@ SROA::runOnAlloca(AllocaInst &AI) { // Build the slices using a recursive instruction-visiting builder. 
AllocaSlices AS(DL, AI); LLVM_DEBUG(AS.print(dbgs())); - if (AS.isEscaped()) + if (AS.isEscaped()) { + if (AI.hasMetadata(LLVMContext::MD_immutable)) + Changed |= tryToPropagateImmutableAllocaInitValue(AI, AS); return {Changed, CFGChanged}; + } if (AS.isEscapedReadOnly()) { Changed |= propagateStoredValuesToLoads(AI, AS); diff --git a/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll b/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll new file mode 100644 index 0000000000000..ada0a1107978a --- /dev/null +++ b/llvm/test/Transforms/SROA/sroa-immutable-alloca-propagation.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=sroa < %s | FileCheck %s + +; Function Attrs: nounwind uwtable +define dso_local i32 @test2(i16 noundef signext %p1) { +; CHECK-LABEL: define dso_local i32 @test2( +; CHECK-SAME: i16 noundef signext [[P1:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P2:%.*]] = alloca i16, align 2, !immutable [[META0:![0-9]+]] +; CHECK-NEXT: call void @llvm.lifetime.start.p0(ptr [[P2]]) +; CHECK-NEXT: store i16 [[P1]], ptr [[P2]], align 2 +; CHECK-NEXT: call void @foo(ptr noundef [[P2]]) +; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[P1]] to i32 +; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[P2]], align 2 +; CHECK-NEXT: [[CONV1:%.*]] = sext i16 [[P1]] to i32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[CONV]], [[CONV1]] +; CHECK-NEXT: br i1 [[CMP]], label %[[IF_THEN:.*]], label %[[IF_END:.*]] +; CHECK: [[IF_THEN]]: +; CHECK-NEXT: br label %[[CLEANUP:.*]] +; CHECK: [[IF_END]]: +; CHECK-NEXT: br label %[[CLEANUP]] +; CHECK: [[CLEANUP]]: +; CHECK-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 14, %[[IF_THEN]] ], [ 42, %[[IF_END]] ] +; CHECK-NEXT: call void @llvm.lifetime.end.p0(ptr [[P2]]) +; CHECK-NEXT: ret i32 [[RETVAL_0]] +; +entry: + %retval = alloca i32, align 4 + %p1.addr = alloca i16, align 2 + %p2 = alloca i16, align 2, !immutable !1 + %cleanup.dest.slot = alloca 
i32, align 4 + store i16 %p1, ptr %p1.addr, align 2 + call void @llvm.lifetime.start.p0(ptr %p2) + %0 = load i16, ptr %p1.addr, align 2 + store i16 %0, ptr %p2, align 2 + call void @foo(ptr noundef %p2) + %1 = load i16, ptr %p1.addr, align 2 + %conv = sext i16 %1 to i32 + %2 = load i16, ptr %p2, align 2 + %conv1 = sext i16 %2 to i32 + %cmp = icmp eq i32 %conv, %conv1 + br i1 %cmp, label %if.then, label %if.end + +if.then: ; preds = %entry + store i32 14, ptr %retval, align 4 + store i32 1, ptr %cleanup.dest.slot, align 4 + br label %cleanup + +if.end: ; preds = %entry + store i32 42, ptr %retval, align 4 + store i32 1, ptr %cleanup.dest.slot, align 4 + br label %cleanup + +cleanup: ; preds = %if.end, %if.then + call void @llvm.lifetime.end.p0(ptr %p2) + %3 = load i32, ptr %retval, align 4 + ret i32 %3 +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.start.p0(ptr captures(none)) + +declare void @foo(ptr noundef) + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite) +declare void @llvm.lifetime.end.p0(ptr captures(none)) + +!1 = !{i32 1} +;. +; CHECK: [[META0]] = !{i32 1} +;.