SROA: Simplify addrspacecasted allocas with volatile accesses
If the alloca is accessed through an addrspacecasted pointer, allow
the normal rewriting of the alloca. Volatile accesses are cast back to
the address space of the original use instead of using the new
alloca's natural address space.
arsenm committed Dec 2, 2022 (commit 2738789, parent d0b954c)
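
The effect is easiest to see in IR: SROA can now split and promote an alloca even when a volatile access reaches it through an addrspacecast, re-inserting the cast on the rewritten access. A reduced before/after sketch (mirroring the alloca_addrspacecast_bitcast_volatile_load test updated below; the value names SROA picks are illustrative):

; Before: the only unusual use of %A is a volatile load through an
; addrspacecast to another address space.
define i64 @example(i64 %X) {
entry:
  %A = alloca [8 x i8]
  %A.cast = addrspacecast ptr %A to ptr addrspace(1)
  store i64 %X, ptr addrspace(1) %A.cast
  %Z = load volatile i64, ptr addrspace(1) %A.cast
  ret i64 %Z
}

; After this commit, SROA rewrites the alloca as usual; only the volatile
; load keeps an addrspacecast back to the address space it was using.
define i64 @example(i64 %X) {
entry:
  %A.sroa.0 = alloca i64, align 8
  store i64 %X, ptr %A.sroa.0, align 8
  %0 = addrspacecast ptr %A.sroa.0 to ptr addrspace(1)
  %Z = load volatile i64, ptr addrspace(1) %0, align 8
  ret i64 %Z
}

Previously, any volatile access whose address space differed from the alloca's made SROA abort on the whole alloca.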
Showing 4 changed files with 111 additions and 49 deletions.
llvm/lib/Transforms/Scalar/SROA.cpp (33 additions, 28 deletions)
@@ -778,10 +778,6 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
     if (!IsOffsetKnown)
       return PI.setAborted(&LI);
 
-    if (LI.isVolatile() &&
-        LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
-      return PI.setAborted(&LI);
-
     if (isa<ScalableVectorType>(LI.getType()))
       return PI.setAborted(&LI);
 
@@ -796,10 +792,6 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
     if (!IsOffsetKnown)
       return PI.setAborted(&SI);
 
-    if (SI.isVolatile() &&
-        SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
-      return PI.setAborted(&SI);
-
     if (isa<ScalableVectorType>(ValOp->getType()))
       return PI.setAborted(&SI);
 
@@ -837,11 +829,6 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
 
-    // Don't replace this with a store with a different address space. TODO:
-    // Use a store with the casted new alloca?
-    if (II.isVolatile() && II.getDestAddressSpace() != DL.getAllocaAddrSpace())
-      return PI.setAborted(&II);
-
     insertUse(II, Offset, Length ? Length->getLimitedValue()
                                  : AllocSize - Offset.getLimitedValue(),
               (bool)Length);
@@ -861,13 +848,6 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
     if (!IsOffsetKnown)
       return PI.setAborted(&II);
 
-    // Don't replace this with a load/store with a different address space.
-    // TODO: Use a store with the casted new alloca?
-    if (II.isVolatile() &&
-        (II.getDestAddressSpace() != DL.getAllocaAddrSpace() ||
-         II.getSourceAddressSpace() != DL.getAllocaAddrSpace()))
-      return PI.setAborted(&II);
-
     // This side of the transfer is completely out-of-bounds, and so we can
     // nuke the entire transfer. However, we also need to nuke the other side
     // if already added to our partitions.
@@ -2335,6 +2315,16 @@ class llvm::sroa::AllocaSliceRewriter
   // the insertion point is set to point to the user.
   IRBuilderTy IRB;
 
+  // Return the new alloca, addrspacecasted if required to avoid changing the
+  // addrspace of a volatile access.
+  Value *getPtrToNewAI(unsigned AddrSpace, bool IsVolatile) {
+    if (!IsVolatile || AddrSpace == NewAI.getType()->getPointerAddressSpace())
+      return &NewAI;
+
+    Type *AccessTy = NewAI.getAllocatedType()->getPointerTo(AddrSpace);
+    return IRB.CreateAddrSpaceCast(&NewAI, AccessTy);
+  }
+
 public:
   AllocaSliceRewriter(const DataLayout &DL, AllocaSlices &AS, SROAPass &Pass,
                       AllocaInst &OldAI, AllocaInst &NewAI,
@@ -2535,7 +2525,9 @@ class llvm::sroa::AllocaSliceRewriter
         (canConvertValue(DL, NewAllocaTy, TargetTy) ||
          (IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
           TargetTy->isIntegerTy()))) {
-      LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
+      Value *NewPtr =
+          getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile());
+      LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr,
                                               NewAI.getAlign(), LI.isVolatile(),
                                               LI.getName());
       if (AATags)
@@ -2726,8 +2718,11 @@ class llvm::sroa::AllocaSliceRewriter
       }
 
       V = convertValue(DL, IRB, V, NewAllocaTy);
+      Value *NewPtr =
+          getPtrToNewAI(SI.getPointerAddressSpace(), SI.isVolatile());
+
       NewSI =
-          IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
+          IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), SI.isVolatile());
     } else {
       unsigned AS = SI.getPointerAddressSpace();
       Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
@@ -2900,8 +2895,9 @@ class llvm::sroa::AllocaSliceRewriter
       V = convertValue(DL, IRB, V, AllocaTy);
     }
 
+    Value *NewPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
     StoreInst *New =
-        IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
+        IRB.CreateAlignedStore(V, NewPtr, NewAI.getAlign(), II.isVolatile());
     New->copyMetadata(II, {LLVMContext::MD_mem_parallel_loop_access,
                            LLVMContext::MD_access_group});
     if (AATags)
@@ -3054,14 +3050,22 @@ class llvm::sroa::AllocaSliceRewriter
     }
     OtherPtrTy = OtherTy->getPointerTo(OtherAS);
 
-    Value *SrcPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
+    Value *AdjPtr = getAdjustedPtr(IRB, DL, OtherPtr, OtherOffset, OtherPtrTy,
                                    OtherPtr->getName() + ".");
     MaybeAlign SrcAlign = OtherAlign;
-    Value *DstPtr = &NewAI;
     MaybeAlign DstAlign = SliceAlign;
-    if (!IsDest) {
-      std::swap(SrcPtr, DstPtr);
+    if (!IsDest)
       std::swap(SrcAlign, DstAlign);
+
+    Value *SrcPtr;
+    Value *DstPtr;
+
+    if (IsDest) {
+      DstPtr = getPtrToNewAI(II.getDestAddressSpace(), II.isVolatile());
+      SrcPtr = AdjPtr;
+    } else {
+      DstPtr = AdjPtr;
+      SrcPtr = getPtrToNewAI(II.getSourceAddressSpace(), II.isVolatile());
     }
 
     Value *Src;
@@ -4713,7 +4717,8 @@ bool SROAPass::deleteDeadInstructions(
   bool Changed = false;
   while (!DeadInsts.empty()) {
     Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val());
-    if (!I) continue;
+    if (!I)
+      continue;
     LLVM_DEBUG(dbgs() << "Deleting dead instruction: " << *I << "\n");
 
     // If the instruction is an alloca, find the possible dbg.declare connected
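
Of the four rewrite sites, the memory-transfer one changes shape the most: rather than initializing SrcPtr/DstPtr and conditionally swapping them, the rewriter now computes the adjusted other-side pointer (AdjPtr) once and assigns each side explicitly, so that whichever side refers to the new alloca goes through getPtrToNewAI. For a volatile memcpy out of the alloca into a generic pointer, the rewritten IR becomes a volatile load through the cast followed by a volatile store (a sketch consistent with the volatile_memcpy test below; names are illustrative):

  %cast = addrspacecast ptr %a.sroa.0 to ptr addrspace(1)
  %val = load volatile i32, ptr addrspace(1) %cast, align 4
  store volatile i32 %val, ptr %dst, align 1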
llvm/test/CodeGen/AMDGPU/flat-address-space.ll (2 additions, 0 deletions)
@@ -130,6 +130,7 @@ define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16
 define amdgpu_kernel void @flat_scratch_unaligned_load() {
   %scratch = alloca i32, addrspace(5)
   %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
+  store volatile i32* %fptr, i32* addrspace(3)* null
   %ld = load volatile i32, i32* %fptr, align 1
   ret void
 }
@@ -142,6 +143,7 @@ define amdgpu_kernel void @flat_scratch_unaligned_load() {
 define amdgpu_kernel void @flat_scratch_unaligned_store() {
   %scratch = alloca i32, addrspace(5)
   %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
+  store volatile i32* %fptr, i32* addrspace(3)* null
   store volatile i32 0, i32* %fptr, align 1
   ret void
 }
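
These two kernels test codegen of unaligned flat accesses to scratch, which requires the alloca to survive into instruction selection. Presumably the new volatile store of %fptr itself is there to make the pointer escape, since SROA (which AMDGPU runs in its codegen pipeline) would otherwise now promote %scratch away:

  %scratch = alloca i32, addrspace(5)
  %fptr = addrspacecast i32 addrspace(5)* %scratch to i32*
  ; %fptr escapes here, so the alloca and the unaligned flat access under
  ; test still reach instruction selection.
  store volatile i32* %fptr, i32* addrspace(3)* null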
llvm/test/Transforms/SROA/addrspacecast.ll (74 additions, 19 deletions)
@@ -149,11 +149,11 @@ entry:
 define i64 @alloca_addrspacecast_bitcast_volatile_store(i64 %X) {
 ; CHECK-LABEL: @alloca_addrspacecast_bitcast_volatile_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [8 x i8], align 1
-; CHECK-NEXT:    [[A_CAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
-; CHECK-NEXT:    store volatile i64 [[X:%.*]], ptr addrspace(1) [[A_CAST]], align 4
-; CHECK-NEXT:    [[Z:%.*]] = load i64, ptr addrspace(1) [[A_CAST]], align 4
-; CHECK-NEXT:    ret i64 [[Z]]
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[A_SROA_0]] to ptr addrspace(1)
+; CHECK-NEXT:    store volatile i64 [[X:%.*]], ptr addrspace(1) [[TMP0]], align 8
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_Z:%.*]] = load i64, ptr [[A_SROA_0]], align 8
+; CHECK-NEXT:    ret i64 [[A_SROA_0_0_A_SROA_0_0_Z]]
 ;
 entry:
   %A = alloca [8 x i8]
@@ -163,15 +163,42 @@ entry:
   ret i64 %Z
 }
 
+%struct = type { [256 x i8], i32 }
+
+define i65 @volatile_store_addrspacecast_slice(i65 %X, i16 %idx) {
+; CHECK-LABEL: @volatile_store_addrspacecast_slice(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca [9 x i8], align 4
+; CHECK-NEXT:    [[A_SROA_1:%.*]] = alloca [9 x i8], align 8
+; CHECK-NEXT:    [[A_SROA_1_0_GEPB_SROA_CAST:%.*]] = addrspacecast ptr [[A_SROA_1]] to ptr addrspace(1)
+; CHECK-NEXT:    store volatile i65 [[X:%.*]], ptr addrspace(1) [[A_SROA_1_0_GEPB_SROA_CAST]], align 8
+; CHECK-NEXT:    br label [[L2:%.*]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_20_Z:%.*]] = load i65, ptr [[A_SROA_0]], align 4
+; CHECK-NEXT:    ret i65 [[A_SROA_0_0_A_SROA_0_20_Z]]
+;
+entry:
+  %A = alloca %struct
+  %B = addrspacecast ptr %A to ptr addrspace(1)
+  %gepA = getelementptr %struct, ptr %A, i32 0, i32 0, i16 20
+  %gepB = getelementptr i65, ptr addrspace(1) %B, i16 6
+  store volatile i65 %X, ptr addrspace(1) %gepB, align 1
+  br label %L2
+
+L2:
+  %Z = load i65, ptr %gepA, align 1
+  ret i65 %Z
+}
+
 ; Don't change the address space of a volatile operation
 define i64 @alloca_addrspacecast_bitcast_volatile_load(i64 %X) {
 ; CHECK-LABEL: @alloca_addrspacecast_bitcast_volatile_load(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [8 x i8], align 1
-; CHECK-NEXT:    [[A_CAST:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
-; CHECK-NEXT:    store i64 [[X:%.*]], ptr addrspace(1) [[A_CAST]], align 4
-; CHECK-NEXT:    [[Z:%.*]] = load volatile i64, ptr addrspace(1) [[A_CAST]], align 4
-; CHECK-NEXT:    ret i64 [[Z]]
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i64, align 8
+; CHECK-NEXT:    store i64 [[X:%.*]], ptr [[A_SROA_0]], align 8
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[A_SROA_0]] to ptr addrspace(1)
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_Z:%.*]] = load volatile i64, ptr addrspace(1) [[TMP0]], align 8
+; CHECK-NEXT:    ret i64 [[A_SROA_0_0_A_SROA_0_0_Z]]
 ;
 entry:
   %A = alloca [8 x i8]
@@ -183,15 +210,40 @@ entry:
 
 declare void @llvm.memset.p1.i32(ptr addrspace(1) nocapture, i8, i32, i1) nounwind
 
+define i65 @volatile_load_addrspacecast_slice(i65 %X, i16 %idx) {
+; CHECK-LABEL: @volatile_load_addrspacecast_slice(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca [9 x i8], align 4
+; CHECK-NEXT:    [[A_SROA_1:%.*]] = alloca [9 x i8], align 8
+; CHECK-NEXT:    [[A_SROA_1_0_GEPB_SROA_CAST:%.*]] = addrspacecast ptr [[A_SROA_1]] to ptr addrspace(1)
+; CHECK-NEXT:    store i65 [[X:%.*]], ptr addrspace(1) [[A_SROA_1_0_GEPB_SROA_CAST]], align 8
+; CHECK-NEXT:    br label [[L2:%.*]]
+; CHECK:       L2:
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_20_Z:%.*]] = load volatile i65, ptr [[A_SROA_0]], align 4
+; CHECK-NEXT:    ret i65 [[A_SROA_0_0_A_SROA_0_20_Z]]
+;
+entry:
+  %A = alloca %struct
+  %B = addrspacecast ptr %A to ptr addrspace(1)
+  %gepA = getelementptr %struct, ptr %A, i32 0, i32 0, i16 20
+  %gepB = getelementptr i65, ptr addrspace(1) %B, i16 6
+  store i65 %X, ptr addrspace(1) %gepB, align 1
+  br label %L2
+
+L2:
+  %Z = load volatile i65, ptr %gepA, align 1
+  ret i65 %Z
+}
+
 ; Don't change the address space of a volatile operation
 define i32 @volatile_memset() {
 ; CHECK-LABEL: @volatile_memset(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i8], align 1
-; CHECK-NEXT:    [[ASC:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
-; CHECK-NEXT:    call void @llvm.memset.p1.i32(ptr addrspace(1) [[ASC]], i8 42, i32 4, i1 true)
-; CHECK-NEXT:    [[VAL:%.*]] = load i32, ptr [[A]], align 4
-; CHECK-NEXT:    ret i32 [[VAL]]
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[A_SROA_0]] to ptr addrspace(1)
+; CHECK-NEXT:    store volatile i32 707406378, ptr addrspace(1) [[TMP0]], align 4
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_VAL:%.*]] = load i32, ptr [[A_SROA_0]], align 4
+; CHECK-NEXT:    ret i32 [[A_SROA_0_0_A_SROA_0_0_VAL]]
 ;
 entry:
   %a = alloca [4 x i8]
@@ -205,10 +257,13 @@ entry:
 define void @volatile_memcpy(ptr %src, ptr %dst) {
 ; CHECK-LABEL: @volatile_memcpy(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca [4 x i8], align 1
-; CHECK-NEXT:    [[ASC:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(1)
-; CHECK-NEXT:    call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) [[ASC]], ptr [[SRC:%.*]], i32 4, i1 true), !tbaa [[TBAA0:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.memcpy.p0.p1.i32(ptr [[DST:%.*]], ptr addrspace(1) [[ASC]], i32 4, i1 true), !tbaa [[TBAA3:![0-9]+]]
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca i32, align 4
+; CHECK-NEXT:    [[A_SROA_0_0_COPYLOAD:%.*]] = load volatile i32, ptr [[SRC:%.*]], align 1, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr [[A_SROA_0]] to ptr addrspace(1)
+; CHECK-NEXT:    store volatile i32 [[A_SROA_0_0_COPYLOAD]], ptr addrspace(1) [[TMP0]], align 4, !tbaa [[TBAA0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = addrspacecast ptr [[A_SROA_0]] to ptr addrspace(1)
+; CHECK-NEXT:    [[A_SROA_0_0_A_SROA_0_0_COPYLOAD1:%.*]] = load volatile i32, ptr addrspace(1) [[TMP1]], align 4, !tbaa [[TBAA3:![0-9]+]]
+; CHECK-NEXT:    store volatile i32 [[A_SROA_0_0_A_SROA_0_0_COPYLOAD1]], ptr [[DST:%.*]], align 1, !tbaa [[TBAA3]]
 ; CHECK-NEXT:    ret void
 ;
 entry:
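
A side note on the @volatile_memset output above: the store volatile i32 707406378 is not arbitrary. Once the 4-byte slice is promoted to an i32, the volatile llvm.memset of byte 42 (0x2A) folds into a single volatile store of the splatted value, issued through the addrspacecasted pointer:

  call void @llvm.memset.p1.i32(ptr addrspace(1) %asc, i8 42, i32 4, i1 true)
  ; becomes, after promotion (0x2A splatted: 0x2A * 0x01010101 = 0x2A2A2A2A = 707406378):
  store volatile i32 707406378, ptr addrspace(1) %cast, align 4

(%asc and %cast stand in for the original and SROA-generated casts.)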
llvm/test/Transforms/SROA/basictest.ll (2 additions, 2 deletions)
@@ -1197,9 +1197,9 @@ define void @PR14105_as1(ptr addrspace(1) %ptr) {
 ;
 ; CHECK-LABEL: @PR14105_as1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A:%.*]] = alloca { [16 x i8] }, align 8
+; CHECK-NEXT:    [[A_SROA_0:%.*]] = alloca [16 x i8], align 8
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds { [16 x i8] }, ptr addrspace(1) [[PTR:%.*]], i64 -1
-; CHECK-NEXT:    call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 8 [[GEP]], ptr align 8 [[A]], i32 16, i1 true)
+; CHECK-NEXT:    call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 8 [[GEP]], ptr align 8 [[A_SROA_0]], i32 16, i1 true)
 ; CHECK-NEXT:    ret void
 ;
 entry:
