Skip to content

Commit b7a1127

Browse files
committed
[LoopVectorize] Fix scalarisation crash in widenPHIInstruction for scalable vectors
In InnerLoopVectorizer::widenPHIInstruction there are cases where we have to scalarise a pointer induction variable after vectorisation. For scalable vectors we already deal with the case where the pointer induction variable is uniform, but we currently crash if it is not uniform. For fixed-width vectors we calculate every lane of the scalarised pointer induction variable for a given VF; however, this cannot work for scalable vectors. In this case I have added support for caching the whole vector value for each unrolled part so that we can always extract an arbitrary element. Additionally, we continue to cache the known minimum number of lanes in order to improve code quality by avoiding an extractelement operation. I have adapted an existing test, `pointer_iv_mixed`, from the file Transforms/LoopVectorize/consecutive-ptr-uniforms.ll and added it for scalable vectors in Transforms/LoopVectorize/AArch64/sve-widen-phi.ll. Differential Revision: https://reviews.llvm.org/D101294
1 parent 6e6f9a6 commit b7a1127

File tree

2 files changed

+97
-7
lines changed

2 files changed

+97
-7
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 38 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -3335,8 +3335,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
33353335
SCEVExpander Exp(*SE, DL, "induction");
33363336
auto Step = ID.getStep();
33373337
auto StartValue = ID.getStartValue();
3338-
assert(Index->getType() == Step->getType() &&
3339-
"Index type does not match StepValue type");
3338+
assert(Index->getType()->getScalarType() == Step->getType() &&
3339+
"Index scalar type does not match StepValue type");
33403340

33413341
// Note: the IR at this point is broken. We cannot use SE to create any new
33423342
// SCEV and then expand it, hoping that SCEV's simplification will give us
@@ -3355,14 +3355,20 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
33553355
return B.CreateAdd(X, Y);
33563356
};
33573357

3358+
// We allow X to be a vector type, in which case Y will potentially be
3359+
// splatted into a vector with the same element count.
33583360
auto CreateMul = [&B](Value *X, Value *Y) {
3359-
assert(X->getType() == Y->getType() && "Types don't match!");
3361+
assert(X->getType()->getScalarType() == Y->getType() &&
3362+
"Types don't match!");
33603363
if (auto *CX = dyn_cast<ConstantInt>(X))
33613364
if (CX->isOne())
33623365
return Y;
33633366
if (auto *CY = dyn_cast<ConstantInt>(Y))
33643367
if (CY->isOne())
33653368
return X;
3369+
VectorType *XVTy = dyn_cast<VectorType>(X->getType());
3370+
if (XVTy && !isa<VectorType>(Y->getType()))
3371+
Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
33663372
return B.CreateMul(X, Y);
33673373
};
33683374

@@ -3381,6 +3387,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
33813387

33823388
switch (ID.getKind()) {
33833389
case InductionDescriptor::IK_IntInduction: {
3390+
assert(!isa<VectorType>(Index->getType()) &&
3391+
"Vector indices not supported for integer inductions yet");
33843392
assert(Index->getType() == StartValue->getType() &&
33853393
"Index type does not match StartValue type");
33863394
if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
@@ -3395,9 +3403,12 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
33953403
return B.CreateGEP(
33963404
StartValue->getType()->getPointerElementType(), StartValue,
33973405
CreateMul(Index,
3398-
Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
3406+
Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
3407+
GetInsertPoint())));
33993408
}
34003409
case InductionDescriptor::IK_FpInduction: {
3410+
assert(!isa<VectorType>(Index->getType()) &&
3411+
"Vector indices not supported for FP inductions yet");
34013412
assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
34023413
auto InductionBinOp = ID.getInductionBinOp();
34033414
assert(InductionBinOp &&
@@ -4816,13 +4827,33 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
48164827
// iteration. If the instruction is uniform, we only need to generate the
48174828
// first lane. Otherwise, we generate all VF values.
48184829
bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
4819-
assert((IsUniform || !VF.isScalable()) &&
4820-
"Currently unsupported for scalable vectors");
4821-
unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
4830+
unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
4831+
4832+
bool NeedsVectorIndex = !IsUniform && VF.isScalable();
4833+
Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
4834+
if (NeedsVectorIndex) {
4835+
Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
4836+
UnitStepVec = Builder.CreateStepVector(VecIVTy);
4837+
PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
4838+
}
48224839

48234840
for (unsigned Part = 0; Part < UF; ++Part) {
48244841
Value *PartStart = createStepForVF(
48254842
Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
4843+
4844+
if (NeedsVectorIndex) {
4845+
Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
4846+
Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
4847+
Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
4848+
Value *SclrGep =
4849+
emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
4850+
SclrGep->setName("next.gep");
4851+
State.set(PhiR, SclrGep, Part);
4852+
// We've cached the whole vector, which means we can support the
4853+
// extraction of any lane.
4854+
continue;
4855+
}
4856+
48264857
for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
48274858
Value *Idx = Builder.CreateAdd(
48284859
PartStart, ConstantInt::get(PtrInd->getType(), Lane));

llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -114,9 +114,68 @@ for.cond.cleanup: ; preds = %for.body
114114
ret void
115115
}
116116

117+
118+
;
119+
; Check multiple pointer induction variables where only one is recognized as
120+
; uniform and remains uniform after vectorization. The other pointer induction
121+
; variable is not recognized as uniform and is not uniform after vectorization
122+
; because it is stored to memory.
123+
;
124+
125+
define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) {
126+
; CHECK-LABEL: @pointer_iv_mixed(
127+
; CHECK: vector.body
128+
; CHECK: %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
129+
; CHECK: %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
130+
; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[IDX]], i32 0
131+
; CHECK-NEXT: %[[TMP2:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
132+
; CHECK-NEXT: %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[STEPVEC]]
133+
; CHECK-NEXT: %[[APTRS1:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND1]]
134+
; CHECK-NEXT: %[[VSCALE64:.*]] = call i64 @llvm.vscale.i64()
135+
; CHECK-NEXT: %[[VSCALE64X2:.*]] = shl i64 %[[VSCALE64]], 1
136+
; CHECK-NEXT: %[[TMP3:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[VSCALE64X2]], i32 0
137+
; CHECK-NEXT: %[[TMP4:.*]] = shufflevector <vscale x 2 x i64> %[[TMP3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
138+
; CHECK-NEXT: %[[TMP5:.*]] = add <vscale x 2 x i64> %[[TMP4]], %[[STEPVEC]]
139+
; CHECK-NEXT: %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[TMP5]]
140+
; CHECK-NEXT: %[[APTRS2:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND2]]
141+
; CHECK-NEXT: %[[GEPB1:.*]] = getelementptr i32*, i32** %b, i64 %[[IDX]]
142+
; CHECK: %[[BPTR1:.*]] = bitcast i32** %[[GEPB1]] to <vscale x 2 x i32*>*
143+
; CHECK-NEXT: store <vscale x 2 x i32*> %[[APTRS1]], <vscale x 2 x i32*>* %[[BPTR1]], align 8
144+
; CHECK: %[[VSCALE32:.*]] = call i32 @llvm.vscale.i32()
145+
; CHECK-NEXT: %[[VSCALE32X2:.*]] = shl i32 %[[VSCALE32]], 1
146+
; CHECK-NEXT: %[[TMP6:.*]] = sext i32 %[[VSCALE32X2]] to i64
147+
; CHECK-NEXT: %[[GEPB2:.*]] = getelementptr i32*, i32** %[[GEPB1]], i64 %[[TMP6]]
148+
; CHECK-NEXT: %[[BPTR2:.*]] = bitcast i32** %[[GEPB2]] to <vscale x 2 x i32*>*
149+
; CHECK-NEXT: store <vscale x 2 x i32*> %[[APTRS2]], <vscale x 2 x i32*>* %[[BPTR2]], align 8
150+
151+
entry:
152+
br label %for.body
153+
154+
for.body:
155+
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
156+
%p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
157+
%q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
158+
%tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ]
159+
%tmp1 = load i32, i32* %p, align 8
160+
%tmp2 = add i32 %tmp1, %tmp0
161+
store i32* %p, i32** %q, align 8
162+
%tmp3 = getelementptr inbounds i32, i32* %p, i32 1
163+
%tmp4 = getelementptr inbounds i32*, i32** %q, i32 1
164+
%i.next = add nuw nsw i64 %i, 1
165+
%cond = icmp slt i64 %i.next, %n
166+
br i1 %cond, label %for.body, label %for.end, !llvm.loop !6
167+
168+
for.end:
169+
%tmp5 = phi i32 [ %tmp2, %for.body ]
170+
ret i32 %tmp5
171+
}
172+
173+
117174
!0 = distinct !{!0, !1, !2, !3, !4, !5}
118175
!1 = !{!"llvm.loop.mustprogress"}
119176
!2 = !{!"llvm.loop.vectorize.width", i32 4}
120177
!3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
121178
!4 = !{!"llvm.loop.vectorize.enable", i1 true}
122179
!5 = !{!"llvm.loop.interleave.count", i32 2}
180+
!6 = distinct !{!6, !1, !7, !3, !4, !5}
181+
!7 = !{!"llvm.loop.vectorize.width", i32 2}

0 commit comments

Comments
 (0)