Skip to content

Commit 3ea089b

Browse files
authored
[AArch64] Enable RT and partial unrolling with reductions for Apple CPUs. (#149699)
Update the unrolling preferences for Apple Silicon CPUs to enable partial unrolling and runtime unrolling for small loops with reductions. This builds on the unroller changes that introduce parallel reduction phis where possible (#149470). PR: #149699
1 parent 86b1e1b commit 3ea089b

File tree

6 files changed

+150
-25
lines changed

6 files changed

+150
-25
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -638,6 +638,8 @@ class TargetTransformInfo {
638638
/// Fall back to the generic logic to determine whether multi-exit unrolling
639639
/// is profitable if set to false.
640640
bool RuntimeUnrollMultiExit;
641+
/// Allow unrolling to add parallel reduction phis.
642+
bool AddAdditionalAccumulators;
641643
};
642644

643645
/// Get target-customized preferences for the generic loop unrolling

llvm/include/llvm/Transforms/Utils/UnrollLoop.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ struct UnrollLoopOptions {
7979
const Instruction *Heart = nullptr;
8080
unsigned SCEVExpansionBudget;
8181
bool RuntimeUnrollMultiExit = false;
82+
bool AddAdditionalAccumulators = false;
8283
};
8384

8485
LLVM_ABI LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO,

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "llvm/Support/Debug.h"
2626
#include "llvm/TargetParser/AArch64TargetParser.h"
2727
#include "llvm/Transforms/InstCombine/InstCombiner.h"
28+
#include "llvm/Transforms/Utils/UnrollLoop.h"
2829
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
2930
#include <algorithm>
3031
#include <optional>
@@ -4969,6 +4970,23 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
49694970
if (!L->getExitBlock())
49704971
return;
49714972

4973+
// Check if the loop contains any reductions that could be parallelized when
4974+
// unrolling. If so, enable partial unrolling, if the trip count is known to be
4975+
// a multiple of 2.
4976+
bool HasParellelizableReductions =
4977+
L->getNumBlocks() == 1 &&
4978+
any_of(L->getHeader()->phis(),
4979+
[&SE, L](PHINode &Phi) {
4980+
return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
4981+
}) &&
4982+
isLoopSizeWithinBudget(L, TTI, 12, nullptr);
4983+
if (HasParellelizableReductions &&
4984+
SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
4985+
UP.Partial = true;
4986+
UP.MaxCount = 4;
4987+
UP.AddAdditionalAccumulators = true;
4988+
}
4989+
49724990
const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
49734991
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
49744992
(SE.getSmallConstantMaxTripCount(L) > 0 &&
@@ -4984,6 +5002,12 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
49845002
// Limit to loops with trip counts that are cheap to expand.
49855003
UP.SCEVExpansionBudget = 1;
49865004

5005+
if (HasParellelizableReductions) {
5006+
UP.Runtime = true;
5007+
UP.DefaultUnrollRuntimeCount = 4;
5008+
UP.AddAdditionalAccumulators = true;
5009+
}
5010+
49875011
// Try to unroll small loops, of few-blocks with low budget, if they have
49885012
// load/store dependencies, to expose more parallel memory access streams,
49895013
// or if they do little work inside a block (i.e. load -> X -> store pattern).

llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
220220
UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
221221
UP.SCEVExpansionBudget = SCEVCheapExpansionBudget;
222222
UP.RuntimeUnrollMultiExit = false;
223+
UP.AddAdditionalAccumulators = false;
223224

224225
// Override with any target specific settings
225226
TTI.getUnrollingPreferences(L, SE, UP, &ORE);
@@ -1354,6 +1355,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
13541355
ULO.Heart = getLoopConvergenceHeart(L);
13551356
ULO.SCEVExpansionBudget = UP.SCEVExpansionBudget;
13561357
ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit;
1358+
ULO.AddAdditionalAccumulators = UP.AddAdditionalAccumulators;
13571359
LoopUnrollResult UnrollResult = UnrollLoop(
13581360
L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
13591361
if (UnrollResult == LoopUnrollResult::Unmodified)

llvm/lib/Transforms/Utils/LoopUnroll.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -671,8 +671,10 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
671671
// to not exit.
672672
DenseMap<PHINode *, RecurrenceDescriptor> Reductions;
673673
bool CanAddAdditionalAccumulators =
674-
UnrollAddParallelReductions && !CompletelyUnroll &&
675-
L->getNumBlocks() == 1 &&
674+
(UnrollAddParallelReductions.getNumOccurrences() > 0
675+
? UnrollAddParallelReductions
676+
: ULO.AddAdditionalAccumulators) &&
677+
!CompletelyUnroll && L->getNumBlocks() == 1 &&
676678
(ULO.Runtime ||
677679
(ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 &&
678680
ExitInfos[Header].BreakoutTrip == 0))));

llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll

Lines changed: 117 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -783,16 +783,34 @@ define i32 @test_add_reduction_unroll_partial(ptr %a, i64 noundef %n) {
783783
; APPLE-NEXT: [[ENTRY:.*]]:
784784
; APPLE-NEXT: br label %[[LOOP:.*]]
785785
; APPLE: [[LOOP]]:
786-
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
786+
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
787+
; APPLE-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
788+
; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
789+
; APPLE-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
787790
; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
788791
; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
789792
; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2
790-
; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]]
791-
; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
792-
; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
793-
; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
793+
; APPLE-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP0]]
794+
; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
795+
; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
796+
; APPLE-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2
797+
; APPLE-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP1]]
798+
; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
799+
; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]]
800+
; APPLE-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A_2]], align 2
801+
; APPLE-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[TMP2]]
802+
; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
803+
; APPLE-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]]
804+
; APPLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_3]], align 2
805+
; APPLE-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP3]]
806+
; APPLE-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
807+
; APPLE-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 1024
808+
; APPLE-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]]
794809
; APPLE: [[EXIT]]:
795-
; APPLE-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ]
810+
; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
811+
; APPLE-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]]
812+
; APPLE-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]]
813+
; APPLE-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX1]]
796814
; APPLE-NEXT: ret i32 [[BIN_RDX2]]
797815
;
798816
; OTHER-LABEL: define i32 @test_add_reduction_unroll_partial(
@@ -923,21 +941,42 @@ define i32 @test_add_and_mul_reduction_unroll_partial(ptr %a, i64 noundef %n) {
923941
; APPLE-NEXT: [[ENTRY:.*]]:
924942
; APPLE-NEXT: br label %[[LOOP:.*]]
925943
; APPLE: [[LOOP]]:
926-
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
927-
; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
928-
; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
944+
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
945+
; APPLE-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[BIN_RDX3:%.*]], %[[LOOP]] ]
946+
; APPLE-NEXT: [[RDX_21:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
947+
; APPLE-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
948+
; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RES_2:%.*]], %[[LOOP]] ]
949+
; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT_3:%.*]], %[[LOOP]] ]
929950
; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
930951
; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2
931-
; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]]
932-
; APPLE-NEXT: [[RDX_2_NEXT]] = mul i32 [[RDX_2]], [[TMP0]]
933-
; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
934-
; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
935-
; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
952+
; APPLE-NEXT: [[RES_2]] = add i32 [[RDX]], [[TMP0]]
953+
; APPLE-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[TMP0]]
954+
; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
955+
; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
956+
; APPLE-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2
957+
; APPLE-NEXT: [[BIN_RDX3]] = add i32 [[RDX_1]], [[TMP1]]
958+
; APPLE-NEXT: [[RDX_2_NEXT_1:%.*]] = mul i32 [[RDX_2_NEXT]], [[TMP1]]
959+
; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
960+
; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]]
961+
; APPLE-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A_2]], align 2
962+
; APPLE-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_21]], [[TMP2]]
963+
; APPLE-NEXT: [[RDX_2_NEXT_2:%.*]] = mul i32 [[RDX_2_NEXT_1]], [[TMP2]]
964+
; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
965+
; APPLE-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]]
966+
; APPLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_3]], align 2
967+
; APPLE-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP3]]
968+
; APPLE-NEXT: [[RDX_2_NEXT_3]] = mul i32 [[RDX_2_NEXT_2]], [[TMP3]]
969+
; APPLE-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
970+
; APPLE-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 1024
971+
; APPLE-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]]
936972
; APPLE: [[EXIT]]:
937-
; APPLE-NEXT: [[BIN_RDX3:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ]
938-
; APPLE-NEXT: [[RES_2:%.*]] = phi i32 [ [[RDX_2_NEXT]], %[[LOOP]] ]
973+
; APPLE-NEXT: [[RES_1:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
974+
; APPLE-NEXT: [[RES_3:%.*]] = phi i32 [ [[RDX_2_NEXT_3]], %[[LOOP]] ]
939975
; APPLE-NEXT: [[SUM:%.*]] = add i32 [[BIN_RDX3]], [[RES_2]]
940-
; APPLE-NEXT: ret i32 [[SUM]]
976+
; APPLE-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_2]], [[SUM]]
977+
; APPLE-NEXT: [[BIN_RDX4:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX2]]
978+
; APPLE-NEXT: [[SUM1:%.*]] = add i32 [[BIN_RDX4]], [[RES_3]]
979+
; APPLE-NEXT: ret i32 [[SUM1]]
941980
;
942981
; OTHER-LABEL: define i32 @test_add_and_mul_reduction_unroll_partial(
943982
; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
@@ -992,18 +1031,72 @@ define i32 @test_add_reduction_runtime(ptr %a, i64 noundef %n) {
9921031
; APPLE-LABEL: define i32 @test_add_reduction_runtime(
9931032
; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
9941033
; APPLE-NEXT: [[ENTRY:.*]]:
1034+
; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
1035+
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 3
1036+
; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3
1037+
; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
1038+
; APPLE: [[ENTRY_NEW]]:
1039+
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
9951040
; APPLE-NEXT: br label %[[LOOP:.*]]
9961041
; APPLE: [[LOOP]]:
997-
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
998-
; APPLE-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP]] ]
1042+
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
1043+
; APPLE-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
1044+
; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
1045+
; APPLE-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
1046+
; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
1047+
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[LOOP]] ]
9991048
; APPLE-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]]
10001049
; APPLE-NEXT: [[TMP6:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2
1001-
; APPLE-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP6]]
1002-
; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
1050+
; APPLE-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP6]]
1051+
; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
1052+
; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
1053+
; APPLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2
1054+
; APPLE-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP3]]
1055+
; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
1056+
; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]]
1057+
; APPLE-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_2]], align 2
1058+
; APPLE-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[TMP4]]
1059+
; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3
1060+
; APPLE-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]]
1061+
; APPLE-NEXT: [[TMP5:%.*]] = load i32, ptr [[GEP_A_3]], align 2
1062+
; APPLE-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP5]]
1063+
; APPLE-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV_EPIL]], 4
1064+
; APPLE-NEXT: [[NITER_NEXT_3]] = add nuw i64 [[NITER]], 4
1065+
; APPLE-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
1066+
; APPLE-NEXT: br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
1067+
; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
1068+
; APPLE-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
1069+
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[LOOP]] ]
1070+
; APPLE-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
1071+
; APPLE-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]]
1072+
; APPLE-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]]
1073+
; APPLE-NEXT: [[BIN_RDX3:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX2]]
1074+
; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
1075+
; APPLE: [[EXIT_UNR_LCSSA]]:
1076+
; APPLE-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[BIN_RDX3]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
1077+
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
1078+
; APPLE-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[BIN_RDX3]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
1079+
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
1080+
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
1081+
; APPLE: [[LOOP_EPIL_PREHEADER]]:
1082+
; APPLE-NEXT: br label %[[LOOP_EPIL:.*]]
1083+
; APPLE: [[LOOP_EPIL]]:
1084+
; APPLE-NEXT: [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ]
1085+
; APPLE-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ [[RDX_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ]
1086+
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
1087+
; APPLE-NEXT: [[GEP_A_EPIL1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL1]]
1088+
; APPLE-NEXT: [[TMP7:%.*]] = load i32, ptr [[GEP_A_EPIL1]], align 2
1089+
; APPLE-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP7]]
1090+
; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL1]], 1
10031091
; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
1004-
; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
1092+
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
1093+
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
1094+
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP3:![0-9]+]]
1095+
; APPLE: [[EXIT_EPILOG_LCSSA]]:
1096+
; APPLE-NEXT: [[RES_PH1:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ]
1097+
; APPLE-NEXT: br label %[[EXIT]]
10051098
; APPLE: [[EXIT]]:
1006-
; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP]] ]
1099+
; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RES_PH1]], %[[EXIT_EPILOG_LCSSA]] ]
10071100
; APPLE-NEXT: ret i32 [[RES]]
10081101
;
10091102
; OTHER-LABEL: define i32 @test_add_reduction_runtime(
@@ -1092,6 +1185,7 @@ exit:
10921185
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
10931186
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
10941187
; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
1188+
; APPLE: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
10951189
;.
10961190
; OTHER: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
10971191
; OTHER: [[META1]] = !{!"llvm.loop.unroll.disable"}

0 commit comments

Comments
 (0)