Skip to content

Commit

Permalink
Revert "Reapply "[SLP] Schedule only sub-graph of vectorizable instru…
Browse files Browse the repository at this point in the history
…ctions"""

This reverts commit 7380427. A second, apparently separate, issue has been reported on the original review.
  • Loading branch information
preames committed Mar 3, 2022
1 parent 6f9640d commit deae979
Show file tree
Hide file tree
Showing 121 changed files with 4,114 additions and 4,117 deletions.
25 changes: 11 additions & 14 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Expand Up @@ -2779,8 +2779,7 @@ class BoUpSLP {
}
// Handle the memory dependencies.
for (ScheduleData *MemoryDepSD : BundleMember->MemoryDependencies) {
if (MemoryDepSD->hasValidDependencies() &&
MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
if (MemoryDepSD->incrementUnscheduledDeps(-1) == 0) {
// There are no more unscheduled dependencies after decrementing,
// so we can put the dependent instruction into the ready list.
ScheduleData *DepBundle = MemoryDepSD->FirstInBundle;
Expand Down Expand Up @@ -2837,8 +2836,7 @@ class BoUpSLP {
void initialFillReadyList(ReadyListType &ReadyList) {
for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) {
doForAllOpcodes(I, [&](ScheduleData *SD) {
if (SD->isSchedulingEntity() && SD->hasValidDependencies() &&
SD->isReady()) {
if (SD->isSchedulingEntity() && SD->isReady()) {
ReadyList.insert(SD);
LLVM_DEBUG(dbgs()
<< "SLP: initially in ready list: " << *SD << "\n");
Expand Down Expand Up @@ -7985,11 +7983,6 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {

LLVM_DEBUG(dbgs() << "SLP: schedule block " << BS->BB->getName() << "\n");

// A key point - if we got here, pre-scheduling was able to find a valid
// scheduling of the sub-graph of the scheduling window which consists
// of all vector bundles and their transitive users. As such, we do not
// need to reschedule anything *outside of* that subgraph.

BS->resetSchedule();

// For the real scheduling we use a more sophisticated ready-list: it is
Expand All @@ -8002,19 +7995,21 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
};
std::set<ScheduleData *, ScheduleDataCompare> ReadyInsts;

// Ensure that all dependency data is updated (for nodes in the sub-graph)
// and fill the ready-list with initial instructions.
// Ensure that all dependency data is updated and fill the ready-list with
// initial instructions.
int Idx = 0;
int NumToSchedule = 0;
for (auto *I = BS->ScheduleStart; I != BS->ScheduleEnd;
I = I->getNextNode()) {
BS->doForAllOpcodes(I, [this, &Idx, BS](ScheduleData *SD) {
BS->doForAllOpcodes(I, [this, &Idx, &NumToSchedule, BS](ScheduleData *SD) {
assert((isVectorLikeInstWithConstOps(SD->Inst) ||
SD->isPartOfBundle() == (getTreeEntry(SD->Inst) != nullptr)) &&
"scheduler and vectorizer bundle mismatch");
SD->FirstInBundle->SchedulingPriority = Idx++;

if (SD->isSchedulingEntity() && SD->isPartOfBundle())
if (SD->isSchedulingEntity()) {
BS->calculateDependencies(SD, false, this);
NumToSchedule++;
}
});
}
BS->initialFillReadyList(ReadyInsts);
Expand All @@ -8037,7 +8032,9 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
}

BS->schedule(picked, ReadyInsts);
NumToSchedule--;
}
assert(NumToSchedule == 0 && "could not schedule all instructions");

// Check that we didn't break any of our invariants.
#ifdef EXPENSIVE_CHECKS
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll
Expand Up @@ -9,11 +9,11 @@ define void @f(float* %r, float* %w) {
; CHECK-LABEL: @f(
; CHECK-NEXT: [[R0:%.*]] = getelementptr inbounds float, float* [[R:%.*]], i64 0
; CHECK-NEXT: [[R1:%.*]] = getelementptr inbounds float, float* [[R]], i64 1
; CHECK-NEXT: [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0
; CHECK-NEXT: [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1
; CHECK-NEXT: [[TMP1:%.*]] = bitcast float* [[R0]] to <2 x float>*
; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, <2 x float>* [[TMP1]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP2]], [[TMP2]]
; CHECK-NEXT: [[W0:%.*]] = getelementptr inbounds float, float* [[W:%.*]], i64 0
; CHECK-NEXT: [[W1:%.*]] = getelementptr inbounds float, float* [[W]], i64 1
; CHECK-NEXT: [[TMP4:%.*]] = bitcast float* [[W0]] to <2 x float>*
; CHECK-NEXT: store <2 x float> [[TMP3]], <2 x float>* [[TMP4]], align 4
; CHECK-NEXT: ret void
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
Expand Up @@ -12,9 +12,9 @@ define void @test1(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]]
; CHECK: for.body3.lr.ph:
; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1
; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
Expand Down Expand Up @@ -57,9 +57,9 @@ define void @test2(%structA* nocapture readonly %J, i32 %xmin, i32 %ymin) {
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
; CHECK-NEXT: br label [[FOR_BODY3_LR_PH:%.*]]
; CHECK: for.body3.lr.ph:
; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], %structA* [[J:%.*]], i64 0, i32 0, i64 0
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA]], %structA* [[J]], i64 0, i32 0, i64 1
; CHECK-NEXT: [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
; CHECK-NEXT: [[TMP3:%.*]] = bitcast float* [[ARRAYIDX4]] to <2 x float>*
; CHECK-NEXT: [[TMP4:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
; CHECK-NEXT: [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
Expand Down
8 changes: 4 additions & 4 deletions llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
Expand Up @@ -36,7 +36,6 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
Expand Down Expand Up @@ -86,6 +85,7 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
Expand All @@ -111,7 +111,6 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
Expand Down Expand Up @@ -161,6 +160,7 @@ define i32 @gather_reduce_8x16_i32(i16* nocapture readonly %a, i16* nocapture re
; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
Expand Down Expand Up @@ -297,7 +297,6 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; GENERIC-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
; GENERIC-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
; GENERIC-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
Expand Down Expand Up @@ -347,6 +346,7 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
; GENERIC-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
; GENERIC-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
; GENERIC-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; GENERIC-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
; GENERIC-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
; GENERIC-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
Expand All @@ -372,7 +372,6 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi i16* [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; KRYO-NEXT: [[TMP0:%.*]] = bitcast i16* [[A_ADDR_0101]] to <8 x i16>*
; KRYO-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
; KRYO-NEXT: [[TMP2:%.*]] = zext <8 x i16> [[TMP1]] to <8 x i32>
Expand Down Expand Up @@ -422,6 +421,7 @@ define i32 @gather_reduce_8x16_i64(i16* nocapture readonly %a, i16* nocapture re
; KRYO-NEXT: [[TMP27:%.*]] = load i16, i16* [[ARRAYIDX55]], align 2
; KRYO-NEXT: [[CONV56:%.*]] = zext i16 [[TMP27]] to i32
; KRYO-NEXT: [[ADD57:%.*]] = add nsw i32 [[ADD48]], [[CONV56]]
; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, i16* [[A_ADDR_0101]], i64 8
; KRYO-NEXT: [[TMP28:%.*]] = extractelement <8 x i32> [[TMP6]], i64 7
; KRYO-NEXT: [[TMP29:%.*]] = sext i32 [[TMP28]] to i64
; KRYO-NEXT: [[ARRAYIDX64:%.*]] = getelementptr inbounds i16, i16* [[G]], i64 [[TMP29]]
Expand Down
6 changes: 3 additions & 3 deletions llvm/test/Transforms/SLPVectorizer/AArch64/horizontal.ll
Expand Up @@ -37,9 +37,9 @@ define i32 @test_select(i32* noalias nocapture readonly %blk1, i32* noalias noca
; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 2
; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 2
; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[P1_023]], i64 3
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_023]] to <4 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[ARRAYIDX21:%.*]] = getelementptr inbounds i32, i32* [[P2_024]], i64 3
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_024]] to <4 x i32>*
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = sub nsw <4 x i32> [[TMP1]], [[TMP3]]
Expand Down Expand Up @@ -163,9 +163,9 @@ define i32 @reduction_with_br(i32* noalias nocapture readonly %blk1, i32* noalia
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 2
; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 2
; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, i32* [[P1_017]], i64 3
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[P1_017]] to <4 x i32>*
; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4
; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i32, i32* [[P2_018]], i64 3
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i32* [[P2_018]] to <4 x i32>*
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, <4 x i32>* [[TMP2]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = mul nsw <4 x i32> [[TMP3]], [[TMP1]]
Expand Down Expand Up @@ -274,10 +274,10 @@ define i32 @test_unrolled_select(i8* noalias nocapture readonly %blk1, i8* noali
; CHECK-NEXT: [[ARRAYIDX61:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 6
; CHECK-NEXT: [[ARRAYIDX63:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 6
; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, i8* [[P1_044]], i64 7
; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7
; CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[P1_044]] to <8 x i8>*
; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1
; CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i32>
; CHECK-NEXT: [[ARRAYIDX74:%.*]] = getelementptr inbounds i8, i8* [[P2_045]], i64 7
; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[P2_045]] to <8 x i8>*
; CHECK-NEXT: [[TMP4:%.*]] = load <8 x i8>, <8 x i8>* [[TMP3]], align 1
; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i8> [[TMP4]] to <8 x i32>
Expand Down
2 changes: 1 addition & 1 deletion llvm/test/Transforms/SLPVectorizer/AArch64/loadi8.ll
Expand Up @@ -18,7 +18,6 @@ define void @f_noalias(i8* noalias nocapture %dst, i8* noalias nocapture readonl
; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 2
; CHECK-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 2
; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 3
; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3
; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to <4 x i8>*
; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, <4 x i8>* [[TMP2]], align 1
; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32>
Expand All @@ -33,6 +32,7 @@ define void @f_noalias(i8* noalias nocapture %dst, i8* noalias nocapture readonl
; CHECK-NEXT: [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
; CHECK-NEXT: [[TMP12:%.*]] = select <4 x i1> [[TMP9]], <4 x i32> [[TMP8]], <4 x i32> [[TMP11]]
; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i32> [[TMP12]] to <4 x i8>
; CHECK-NEXT: [[ARRAYIDX2_3:%.*]] = getelementptr inbounds i8, i8* [[DST]], i64 3
; CHECK-NEXT: [[TMP14:%.*]] = bitcast i8* [[DST]] to <4 x i8>*
; CHECK-NEXT: store <4 x i8> [[TMP13]], <4 x i8>* [[TMP14]], align 1
; CHECK-NEXT: ret void
Expand Down

0 comments on commit deae979

Please sign in to comment.