
Commit 8fab811

[LSR] Add an addressing mode that considers all addressing modes (#158110)
The way loop strength reduction currently works, the target has to decide up front whether it wants its addressing to be pre-indexed, post-indexed, or neither. This choice affects:

* Which potential solutions we generate.
* Whether we consider a pre/post-indexed load/store as costing an AddRec or not.

None of these choices are a good fit for either AArch64 or ARM, where both pre-indexed and post-indexed addressing are typically free:

* If we pick None then we count pre/post-indexed addressing as costing one AddRec more than is correct, so we don't pick them when we should.
* If we pick PreIndexed or PostIndexed then we get the correct cost for that addressing type, but still get it wrong for the other, and also exclude potential solutions using offset addressing that could have lower cost.

This patch adds an "all" addressing mode that causes all potential solutions to be generated and counts both pre- and post-indexed addressing as having an AddRecCost of zero. Unfortunately this reveals problems elsewhere in how we calculate costs, which need to be fixed before we can make use of it.
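For illustration only, a minimal sketch of how a target could eventually opt in, assuming the existing TargetTransformInfo::getPreferredAddressingMode hook; this commit deliberately does not switch any in-tree target over, and ExampleTTIImpl is a made-up name:

    // Hypothetical target override (not part of this patch): report that both
    // pre-indexed and post-indexed addressing are free, so LSR generates and
    // costs solutions for every addressing style.
    TTI::AddressingModeKind
    ExampleTTIImpl::getPreferredAddressingMode(const Loop *L,
                                               ScalarEvolution *SE) const {
      return TTI::AMK_All;
    }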
1 parent aa1a694 commit 8fab811

File tree

3 files changed (+199, -19 lines)

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 7 additions & 3 deletions
@@ -23,6 +23,7 @@
 
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/Analysis/IVDescriptors.h"
 #include "llvm/IR/FMF.h"
 #include "llvm/IR/InstrTypes.h"
@@ -796,10 +797,13 @@ class TargetTransformInfo {
                      LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC,
                      TargetLibraryInfo *LibInfo) const;
 
+  /// Which addressing mode Loop Strength Reduction will try to generate.
   enum AddressingModeKind {
-    AMK_PreIndexed,
-    AMK_PostIndexed,
-    AMK_None
+    AMK_None = 0x0,         ///< Don't prefer any addressing mode
+    AMK_PreIndexed = 0x1,   ///< Prefer pre-indexed addressing mode
+    AMK_PostIndexed = 0x2,  ///< Prefer post-indexed addressing mode
+    AMK_All = 0x3,          ///< Consider all addressing modes
+    LLVM_MARK_AS_BITMASK_ENUM(/*LargestValue=*/AMK_All)
   };
 
   /// Return the preferred addressing mode LSR should make efforts to generate.
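Since the enumerators are now distinct bits and LLVM_MARK_AS_BITMASK_ENUM provides the bitwise operators, AMK_All is simply both bits set, which is what lets the equality tests in LoopStrengthReduce.cpp below become mask tests. A small sketch of the resulting semantics:

    // AMK_All == (AMK_PreIndexed | AMK_PostIndexed) == 0x3, so a mask test
    // fires both for the specific mode and for "all"; AMK_None (0x0) never fires.
    TTI::AddressingModeKind AMK = TTI::AMK_All;
    bool ConsiderPre = (AMK & TTI::AMK_PreIndexed) != 0;   // true
    bool ConsiderPost = (AMK & TTI::AMK_PostIndexed) != 0; // true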

llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp

Lines changed: 14 additions & 16 deletions
@@ -167,17 +167,15 @@ static cl::opt<bool> FilterSameScaledReg(
                    " with the same ScaledReg and Scale"));
 
 static cl::opt<TTI::AddressingModeKind> PreferredAddresingMode(
-  "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
-  cl::desc("A flag that overrides the target's preferred addressing mode."),
-  cl::values(clEnumValN(TTI::AMK_None,
-                        "none",
-                        "Don't prefer any addressing mode"),
-             clEnumValN(TTI::AMK_PreIndexed,
-                        "preindexed",
-                        "Prefer pre-indexed addressing mode"),
-             clEnumValN(TTI::AMK_PostIndexed,
-                        "postindexed",
-                        "Prefer post-indexed addressing mode")));
+    "lsr-preferred-addressing-mode", cl::Hidden, cl::init(TTI::AMK_None),
+    cl::desc("A flag that overrides the target's preferred addressing mode."),
+    cl::values(
+        clEnumValN(TTI::AMK_None, "none", "Don't prefer any addressing mode"),
+        clEnumValN(TTI::AMK_PreIndexed, "preindexed",
+                   "Prefer pre-indexed addressing mode"),
+        clEnumValN(TTI::AMK_PostIndexed, "postindexed",
+                   "Prefer post-indexed addressing mode"),
+        clEnumValN(TTI::AMK_All, "all", "Consider all addressing modes")));
 
 static cl::opt<unsigned> ComplexityLimit(
     "lsr-complexity-limit", cl::Hidden,
@@ -1404,7 +1402,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
     // for now LSR only handles innermost loops).
     if (AR->getLoop() != L) {
       // If the AddRec exists, consider it's register free and leave it alone.
-      if (isExistingPhi(AR, *SE) && AMK != TTI::AMK_PostIndexed)
+      if (isExistingPhi(AR, *SE) && !(AMK & TTI::AMK_PostIndexed))
         return;
 
       // It is bad to allow LSR for current loop to add induction variables
@@ -1427,9 +1425,9 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
     if (match(AR, m_scev_AffineAddRec(m_SCEV(Start), m_SCEVConstant(Step))))
       // If the step size matches the base offset, we could use pre-indexed
      // addressing.
-      if ((AMK == TTI::AMK_PreIndexed && F.BaseOffset.isFixed() &&
+      if (((AMK & TTI::AMK_PreIndexed) && F.BaseOffset.isFixed() &&
           Step->getAPInt() == F.BaseOffset.getFixedValue()) ||
-          (AMK == TTI::AMK_PostIndexed && !isa<SCEVConstant>(Start) &&
+          ((AMK & TTI::AMK_PostIndexed) && !isa<SCEVConstant>(Start) &&
           SE->isLoopInvariant(Start, L)))
        LoopCost = 0;
  }
@@ -4147,7 +4145,7 @@ void LSRInstance::GenerateConstantOffsetsImpl(
   // means that a single pre-indexed access can be generated to become the new
   // base pointer for each iteration of the loop, resulting in no extra add/sub
   // instructions for pointer updating.
-  if (AMK == TTI::AMK_PreIndexed && LU.Kind == LSRUse::Address) {
+  if ((AMK & TTI::AMK_PreIndexed) && LU.Kind == LSRUse::Address) {
     const APInt *StepInt;
     if (match(G, m_scev_AffineAddRec(m_SCEV(), m_scev_APInt(StepInt)))) {
       int64_t Step = StepInt->isNegative() ? StepInt->getSExtValue()
@@ -5437,7 +5435,7 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
     // This can sometimes (notably when trying to favour postinc) lead to
     // sub-optimial decisions. There it is best left to the cost modelling to
     // get correct.
-    if (AMK != TTI::AMK_PostIndexed || LU.Kind != LSRUse::Address) {
+    if (!(AMK & TTI::AMK_PostIndexed) || LU.Kind != LSRUse::Address) {
       int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
       for (const SCEV *Reg : ReqRegs) {
         if ((F.ScaledReg && F.ScaledReg == Reg) ||
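With the cl::opt extended, the new behaviour can be requested directly from the command line, which is how the new test below drives it (input.ll is a placeholder for any input module):

    opt -S -mtriple=aarch64-none-elf -loop-reduce -lsr-preferred-addressing-mode=all < input.ll

As the commit message notes, no in-tree target returns AMK_All yet, so this flag override is currently the way to exercise the mode.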
Lines changed: 178 additions & 0 deletions
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -mtriple=aarch64-none-elf -loop-reduce -lsr-preferred-addressing-mode=all < %s | FileCheck %s
+
+define i32 @postindex_loop(ptr %p, i64 %n) {
+; CHECK-LABEL: define i32 @postindex_loop(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[P]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[N]], %[[ENTRY]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[ADD]] = add i32 [[RET]], [[VAL]]
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %idx = phi i64 [ %idx.next, %for.body ], [ 0, %entry ]
+  %ret = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx
+  %val = load i32, ptr %arrayidx, align 4
+  %add = add i32 %ret, %val
+  %idx.next = add nuw nsw i64 %idx, 1
+  %exitcond = icmp eq i64 %idx.next, %n
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret i32 %add
+}
+
+; Preindex saves a setup instruction compared to postindex
+; FIXME: We currently don't recognize that preindex is possible here
+define i32 @preindex_loop(ptr %p, i64 %n) {
+; CHECK-LABEL: define i32 @preindex_loop(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr nuw i8, ptr [[P]], i64 4
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_BODY]] ], [ [[SCEVGEP]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi i64 [ [[LSR_IV_NEXT:%.*]], %[[FOR_BODY]] ], [ [[N]], %[[ENTRY]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[ADD:%.*]], %[[FOR_BODY]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[VAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[ADD]] = add i32 [[RET]], [[VAL]]
+; CHECK-NEXT: [[LSR_IV_NEXT]] = add i64 [[LSR_IV]], -1
+; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[LSR_IV_NEXT]], 0
+; CHECK-NEXT: br i1 [[EXITCOND]], label %[[EXIT:.*]], label %[[FOR_BODY]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[ADD]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %idx = phi i64 [ %idx.next, %for.body ], [ 0, %entry ]
+  %ret = phi i32 [ %add, %for.body ], [ 0, %entry ]
+  %idx.next = add nuw nsw i64 %idx, 1
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx.next
+  %val = load i32, ptr %arrayidx, align 4
+  %add = add i32 %ret, %val
+  %exitcond = icmp eq i64 %idx.next, %n
+  br i1 %exitcond, label %exit, label %for.body
+
+exit:
+  ret i32 %add
+}
+
+; We should use offset addressing here as postindex uses an extra register.
+; FIXME: We currently use postindex as we don't realize the load of val2 is also
+; a use of p that needs it to be live in the loop.
+define i64 @offset_loop(ptr %p, i64 %n) {
+; CHECK-LABEL: define i64 @offset_loop(
+; CHECK-SAME: ptr [[P:%.*]], i64 [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_BODY]] ], [ [[P]], %[[ENTRY]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[ADD:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IDX_NEXT:%.*]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[VAL1:%.*]] = load i64, ptr [[LSR_IV]], align 4
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw i64, ptr [[P]], i64 [[VAL1]]
+; CHECK-NEXT: [[VAL2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT: [[ADD]] = add i64 [[VAL2]], [[RET]]
+; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 8
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[IDX_NEXT]], [[VAL1]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_END:.*]], label %[[FOR_BODY]]
+; CHECK: [[FOR_END]]:
+; CHECK-NEXT: ret i64 [[ADD]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %ret = phi i64 [ 0, %entry ], [ %add, %for.body ]
+  %idx = phi i64 [ 0, %entry ], [ %idx.next, %for.body ]
+  %arrayidx1 = getelementptr inbounds nuw i64, ptr %p, i64 %idx
+  %val1 = load i64, ptr %arrayidx1, align 4
+  %arrayidx2 = getelementptr inbounds nuw i64, ptr %p, i64 %val1
+  %val2 = load i64, ptr %arrayidx2, align 4
+  %add = add i64 %val2, %ret
+  %idx.next = add nuw nsw i64 %idx, 1
+  %cmp = icmp eq i64 %idx.next, %val1
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i64 %add
+}
+
+; We can't use postindex addressing on the conditional load of qval and can't
+; convert the loop condition to a compare with zero, so we should instead use
+; offset addressing.
+; FIXME: Currently we don't notice the load of qval is conditional, and attempt
+; postindex addressing anyway.
+define i32 @conditional_load(ptr %p, ptr %q, ptr %n) {
+; CHECK-LABEL: define i32 @conditional_load(
+; CHECK-SAME: ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[N:%.*]]) {
+; CHECK-NEXT: [[ENTRY:.*]]:
+; CHECK-NEXT: br label %[[FOR_BODY:.*]]
+; CHECK: [[FOR_BODY]]:
+; CHECK-NEXT: [[LSR_IV1:%.*]] = phi ptr [ [[SCEVGEP2:%.*]], %[[FOR_INC:.*]] ], [ [[P]], %[[ENTRY]] ]
+; CHECK-NEXT: [[LSR_IV:%.*]] = phi ptr [ [[SCEVGEP:%.*]], %[[FOR_INC]] ], [ [[Q]], %[[ENTRY]] ]
+; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ [[IDX_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[RET:%.*]] = phi i32 [ [[RET_NEXT:%.*]], %[[FOR_INC]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEXT: [[PVAL:%.*]] = load i32, ptr [[LSR_IV1]], align 4
+; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[PVAL]], 0
+; CHECK-NEXT: [[SCEVGEP2]] = getelementptr i8, ptr [[LSR_IV1]], i64 4
+; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label %[[FOR_INC]], label %[[IF_THEN:.*]]
+; CHECK: [[IF_THEN]]:
+; CHECK-NEXT: [[QVAL:%.*]] = load i32, ptr [[LSR_IV]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = add i32 [[RET]], [[QVAL]]
+; CHECK-NEXT: br label %[[FOR_INC]]
+; CHECK: [[FOR_INC]]:
+; CHECK-NEXT: [[RET_NEXT]] = phi i32 [ [[ADD]], %[[IF_THEN]] ], [ [[RET]], %[[FOR_BODY]] ]
+; CHECK-NEXT: [[IDX_NEXT]] = add nuw nsw i64 [[IDX]], 1
+; CHECK-NEXT: [[NVAL:%.*]] = load volatile i64, ptr [[N]], align 8
+; CHECK-NEXT: [[SCEVGEP]] = getelementptr i8, ptr [[LSR_IV]], i64 4
+; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[IDX_NEXT]], [[NVAL]]
+; CHECK-NEXT: br i1 [[CMP]], label %[[FOR_BODY]], label %[[EXIT:.*]]
+; CHECK: [[EXIT]]:
+; CHECK-NEXT: ret i32 [[RET_NEXT]]
+;
+entry:
+  br label %for.body
+
+for.body:
+  %idx = phi i64 [ %idx.next, %for.inc ], [ 0, %entry ]
+  %ret = phi i32 [ %ret.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds nuw i32, ptr %p, i64 %idx
+  %pval = load i32, ptr %arrayidx, align 4
+  %tobool.not = icmp eq i32 %pval, 0
+  br i1 %tobool.not, label %for.inc, label %if.then
+
+if.then:
+  %arrayidx1 = getelementptr inbounds nuw i32, ptr %q, i64 %idx
+  %qval = load i32, ptr %arrayidx1, align 4
+  %add = add i32 %ret, %qval
+  br label %for.inc
+
+for.inc:
+  %ret.next = phi i32 [ %add, %if.then ], [ %ret, %for.body ]
+  %idx.next = add nuw nsw i64 %idx, 1
+  %nval = load volatile i64, ptr %n, align 8
+  %cmp = icmp slt i64 %idx.next, %nval
+  br i1 %cmp, label %for.body, label %exit
+
+exit:
+  ret i32 %ret.next
+}
