Skip to content

Commit

Permalink
[AMDGPU] Unroll preferences improvements
Browse files Browse the repository at this point in the history
Exit loop analysis early if suitable private access found.
Do not account for GEPs which are invariant to loop induction variable.
Do not account for Allocas which are too big to fit into register file anyway.
Add option for tuning: -amdgpu-unroll-threshold-private.

Differential Revision: https://reviews.llvm.org/D29473

llvm-svn: 293991
  • Loading branch information
rampitec committed Feb 3, 2017
1 parent f60b684 commit f29602d
Show file tree
Hide file tree
Showing 2 changed files with 148 additions and 1 deletion.
29 changes: 28 additions & 1 deletion llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
Expand Up @@ -29,6 +29,10 @@ using namespace llvm;

#define DEBUG_TYPE "AMDGPUtti"

// Unroll threshold that getUnrollingPreferences() assigns to UP.Threshold
// when it finds a loop GEP into a (sufficiently small) alloca whose operands
// depend on a value defined by the loop itself. Tunable on the command line
// via -amdgpu-unroll-threshold-private; defaults to 800.
static cl::opt<unsigned> UnrollThresholdPrivate(
"amdgpu-unroll-threshold-private",
cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
cl::init(800), cl::Hidden);

void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
TTI::UnrollingPreferences &UP) {
Expand All @@ -38,6 +42,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,

// TODO: Do we want runtime unrolling?

// Maximum alloca size that can fit in registers. Reserve 16 registers.
const unsigned MaxAlloca = (256 - 16) * 4;
for (const BasicBlock *BB : L->getBlocks()) {
const DataLayout &DL = BB->getModule()->getDataLayout();
for (const Instruction &I : *BB) {
Expand All @@ -49,6 +55,26 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
const AllocaInst *Alloca =
dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
if (Alloca) {
Type *Ty = Alloca->getAllocatedType();
unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
if (AllocaSize > MaxAlloca)
continue;

// Check if GEP depends on a value defined by this loop itself.
bool HasLoopDef = false;
for (const Value *Op : GEP->operands()) {
const Instruction *Inst = dyn_cast<Instruction>(Op);
if (!Inst || L->isLoopInvariant(Op))
continue;
if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
return SubLoop->contains(Inst); }))
continue;
HasLoopDef = true;
break;
}
if (!HasLoopDef)
continue;

// We want to do whatever we can to limit the number of alloca
// instructions that make it through to the code generator. allocas
// require us to use indirect addressing, which is slow and prone to
Expand All @@ -59,7 +85,8 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
//
// Don't use the maximum allowed value here as it will make some
// programs way too big.
UP.Threshold = 800;
UP.Threshold = UnrollThresholdPrivate;
return;
}
}
}
Expand Down
120 changes: 120 additions & 0 deletions llvm/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
@@ -0,0 +1,120 @@
; RUN: opt -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=20000 %s | FileCheck %s

; Check that we fully unroll the loop to be able to eliminate the alloca
; CHECK-LABEL: @non_invariant_ind
; CHECK: for.body:
; CHECK-NOT: br
; CHECK: store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
; CHECK: ret void

; Single loop of 100 iterations that stores into a 64-element private array
; at an index computed from the loop induction variable, then reads one
; element of that array back after the loop.
define void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
entry:
%arr = alloca [64 x i32], align 4
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
br label %for.body

; After the loop: load %arr[%x] and store it to %a[workitem id].
for.cond.cleanup: ; preds = %for.body
%arrayidx5 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %x
%tmp15 = load i32, i32* %arrayidx5, align 4
%arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1
store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
ret void

for.body: ; preds = %for.body, %entry
%i.015 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%idxprom = sext i32 %i.015 to i64
%arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
%tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4
; The alloca index %rem is derived from the induction variable %i.015, so the
; GEP below depends on a value defined inside this loop.
%add = add nsw i32 %i.015, %tmp1
%rem = srem i32 %add, 64
%arrayidx3 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %rem
store i32 %tmp16, i32* %arrayidx3, align 4
%inc = add nuw nsw i32 %i.015, 1
%exitcond = icmp eq i32 %inc, 100
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

; Check that we unroll the inner loop but not the outer one
; CHECK-LABEL: @invariant_ind
; CHECK: %[[exitcond:[^ ]+]] = icmp eq i32 %{{.*}}, 32
; CHECK: br i1 %[[exitcond]]
; CHECK-NOT: icmp eq i32 %{{.*}}, 100

; Two-level loop nest: the outer loop (32 iterations, induction variable
; %i.026) loads a value from %a, and the inner loop (100 iterations, induction
; variable %j.025) stores that value into a 64-element private array.
define void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
entry:
%arr = alloca [64 x i32], align 4
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
br label %for.cond2.preheader

; Outer loop header: load %a[%i.026] once per outer iteration.
for.cond2.preheader: ; preds = %for.cond.cleanup5, %entry
%i.026 = phi i32 [ 0, %entry ], [ %inc10, %for.cond.cleanup5 ]
%idxprom = sext i32 %i.026 to i64
%arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
%tmp15 = load i32, i32 addrspace(1)* %arrayidx, align 4
br label %for.body6

; After both loops: load %arr[%x] and store it to %a[workitem id].
for.cond.cleanup: ; preds = %for.cond.cleanup5
%arrayidx13 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %x
%tmp16 = load i32, i32* %arrayidx13, align 4
%arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1
store i32 %tmp16, i32 addrspace(1)* %arrayidx15, align 4
ret void

; Outer loop latch.
for.cond.cleanup5: ; preds = %for.body6
%inc10 = add nuw nsw i32 %i.026, 1
%exitcond27 = icmp eq i32 %inc10, 32
br i1 %exitcond27, label %for.cond.cleanup, label %for.cond2.preheader

; Inner loop: the alloca index %rem is computed from the inner induction
; variable %j.025 only; it does not use the outer induction variable %i.026.
for.body6: ; preds = %for.body6, %for.cond2.preheader
%j.025 = phi i32 [ 0, %for.cond2.preheader ], [ %inc, %for.body6 ]
%add = add nsw i32 %j.025, %tmp1
%rem = srem i32 %add, 64
%arrayidx8 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %rem
store i32 %tmp15, i32* %arrayidx8, align 4
%inc = add nuw nsw i32 %j.025, 1
%exitcond = icmp eq i32 %inc, 100
br i1 %exitcond, label %for.cond.cleanup5, label %for.body6
}

; Check that we do not force unrolling if the alloca is too big
; CHECK-LABEL: @too_big
; CHECK: for.body:
; CHECK: icmp eq i32 %{{.*}}, 100
; CHECK: br

; Same loop shape as @non_invariant_ind, but the private array has 256 i32
; elements (1024 bytes), which is larger than the (256 - 16) * 4 = 960 limit
; used by getUnrollingPreferences() for allocas it is willing to consider.
define void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) {
entry:
%arr = alloca [256 x i32], align 4
%tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
br label %for.body

; After the loop: load %arr[%x] and store it to %a[workitem id].
for.cond.cleanup: ; preds = %for.body
%arrayidx5 = getelementptr inbounds [256 x i32], [256 x i32]* %arr, i32 0, i32 %x
%tmp15 = load i32, i32* %arrayidx5, align 4
%arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1
store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
ret void

for.body: ; preds = %for.body, %entry
%i.015 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
%idxprom = sext i32 %i.015 to i64
%arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
%tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4
; The alloca index still depends on the induction variable %i.015; only the
; size of the alloca differs from @non_invariant_ind.
%add = add nsw i32 %i.015, %tmp1
%rem = srem i32 %add, 64
%arrayidx3 = getelementptr inbounds [256 x i32], [256 x i32]* %arr, i32 0, i32 %rem
store i32 %tmp16, i32* %arrayidx3, align 4
%inc = add nuw nsw i32 %i.015, 1
%exitcond = icmp eq i32 %inc, 100
br i1 %exitcond, label %for.cond.cleanup, label %for.body
}

declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1

declare i32 @llvm.amdgcn.workitem.id.x() #1

declare i32 @llvm.amdgcn.workgroup.id.x() #1

declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #1

attributes #1 = { nounwind readnone }

0 comments on commit f29602d

Please sign in to comment.