[LoopVectorize] Fix scalarisation crash in widenPHIInstruction for scalable vectors

In InnerLoopVectorizer::widenPHIInstruction there are cases where we have
to scalarise a pointer induction variable after vectorisation. For scalable
vectors we already handle the case where the pointer induction variable is
uniform, but we currently crash if it is not uniform. For fixed-width
vectors we calculate every lane of the scalarised pointer induction
variable for a given VF; this cannot work for scalable vectors, where the
number of lanes is not known at compile time. Instead, I have added support
for caching the whole vector value for each unrolled part so that we can
always extract an arbitrary element. Additionally, we still cache the known
minimum number of lanes in order to improve code quality by avoiding an
extractelement operation.
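To make the idea concrete, here is a minimal sketch (not code from this
patch; the helper name is illustrative) of how a single lane can then be
materialised from the cached whole-vector value with one extractelement,
even when the lane index is only known at run time:

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Given the cached whole-vector value for one unrolled part (e.g. a
  // <vscale x 2 x i32*> of pointers), extract an arbitrary lane.
  static Value *extractLane(IRBuilderBase &B, Value *CachedVecPart,
                            Value *LaneIdx) {
    return B.CreateExtractElement(CachedVecPart, LaneIdx, "lane");
  }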

I have adapted an existing test `pointer_iv_mixed` from the file:

  Transforms/LoopVectorize/consecutive-ptr-uniforms.ll

and added it here for scalable vectors instead:

  Transforms/LoopVectorize/AArch64/sve-widen-phi.ll

Differential Revision: https://reviews.llvm.org/D101294
david-arm committed May 12, 2021
1 parent 6e6f9a6 commit b7a1127
Showing 2 changed files with 97 additions and 7 deletions.
45 changes: 38 additions & 7 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3335,8 +3335,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
   SCEVExpander Exp(*SE, DL, "induction");
   auto Step = ID.getStep();
   auto StartValue = ID.getStartValue();
-  assert(Index->getType() == Step->getType() &&
-         "Index type does not match StepValue type");
+  assert(Index->getType()->getScalarType() == Step->getType() &&
+         "Index scalar type does not match StepValue type");
 
   // Note: the IR at this point is broken. We cannot use SE to create any new
   // SCEV and then expand it, hoping that SCEV's simplification will give us
@@ -3355,14 +3355,20 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
     return B.CreateAdd(X, Y);
   };
 
+  // We allow X to be a vector type, in which case Y will potentially be
+  // splatted into a vector with the same element count.
   auto CreateMul = [&B](Value *X, Value *Y) {
-    assert(X->getType() == Y->getType() && "Types don't match!");
+    assert(X->getType()->getScalarType() == Y->getType() &&
+           "Types don't match!");
     if (auto *CX = dyn_cast<ConstantInt>(X))
       if (CX->isOne())
        return Y;
     if (auto *CY = dyn_cast<ConstantInt>(Y))
       if (CY->isOne())
        return X;
+    VectorType *XVTy = dyn_cast<VectorType>(X->getType());
+    if (XVTy && !isa<VectorType>(Y->getType()))
+      Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
     return B.CreateMul(X, Y);
   };
 
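For context, the patched CreateMul lambda now accepts a vector X with a
scalar Y. A minimal standalone sketch of that splat-then-multiply behaviour
(the function name here is illustrative; CreateVectorSplat and CreateMul
are the real IRBuilder calls):

  #include <cassert>
  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // If X is a vector of indices and Y a matching scalar step, splat Y to
  // X's element count (fixed or scalable) before multiplying.
  static Value *mulByScalarStep(IRBuilder<> &B, Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "scalar type of X must match Y");
    if (auto *XVTy = dyn_cast<VectorType>(X->getType()))
      if (!isa<VectorType>(Y->getType()))
        Y = B.CreateVectorSplat(XVTy->getElementCount(), Y);
    return B.CreateMul(X, Y);
  }

For a scalable X such as <vscale x 2 x i64>, CreateVectorSplat emits the
usual insertelement plus zeroinitializer shufflevector sequence, which is
exactly the pattern the new test checks for.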
@@ -3381,6 +3387,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(

   switch (ID.getKind()) {
   case InductionDescriptor::IK_IntInduction: {
+    assert(!isa<VectorType>(Index->getType()) &&
+           "Vector indices not supported for integer inductions yet");
     assert(Index->getType() == StartValue->getType() &&
            "Index type does not match StartValue type");
     if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
@@ -3395,9 +3403,12 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
     return B.CreateGEP(
         StartValue->getType()->getPointerElementType(), StartValue,
         CreateMul(Index,
-                  Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
+                  Exp.expandCodeFor(Step, Index->getType()->getScalarType(),
+                                    GetInsertPoint())));
   }
   case InductionDescriptor::IK_FpInduction: {
+    assert(!isa<VectorType>(Index->getType()) &&
+           "Vector indices not supported for FP inductions yet");
     assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
     auto InductionBinOp = ID.getInductionBinOp();
     assert(InductionBinOp &&
@@ -4816,13 +4827,33 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
       // iteration. If the instruction is uniform, we only need to generate the
       // first lane. Otherwise, we generate all VF values.
       bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
-      assert((IsUniform || !VF.isScalable()) &&
-             "Currently unsupported for scalable vectors");
-      unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();
+      unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();
+
+      bool NeedsVectorIndex = !IsUniform && VF.isScalable();
+      Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
+      if (NeedsVectorIndex) {
+        Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
+        UnitStepVec = Builder.CreateStepVector(VecIVTy);
+        PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
+      }
+
       for (unsigned Part = 0; Part < UF; ++Part) {
         Value *PartStart = createStepForVF(
             Builder, ConstantInt::get(PtrInd->getType(), Part), VF);
+
+        if (NeedsVectorIndex) {
+          Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
+          Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
+          Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
+          Value *SclrGep =
+              emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
+          SclrGep->setName("next.gep");
+          State.set(PhiR, SclrGep, Part);
+          // We've cached the whole vector, which means we can support the
+          // extraction of any lane.
+          continue;
+        }
+
         for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
           Value *Idx = Builder.CreateAdd(
               PartStart, ConstantInt::get(PtrInd->getType(), Lane));
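For reference, createStepForVF computes the index at which each unrolled
part starts. A rough equivalent of the value it produces for PartStart
above when VF is scalable (an assumption about its effect, not the helper's
actual implementation):

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Each unrolled part begins Part * (vscale * KnownMinLanes) elements into
  // the iteration space; vscale is only known at run time.
  static Value *partStartIndex(IRBuilder<> &B, Type *IdxTy, unsigned Part,
                               ElementCount VF) {
    Value *RuntimeVF =
        B.CreateVScale(ConstantInt::get(IdxTy, VF.getKnownMinValue()));
    return B.CreateMul(RuntimeVF, ConstantInt::get(IdxTy, Part));
  }

This is why the test below expects a call to @llvm.vscale.i64 followed by a
shift when computing the indices for the second unrolled part.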
59 changes: 59 additions & 0 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -114,9 +114,68 @@ for.cond.cleanup: ; preds = %for.body
   ret void
 }
+
+
+;
+; Check multiple pointer induction variables where only one is recognized as
+; uniform and remains uniform after vectorization. The other pointer induction
+; variable is not recognized as uniform and is not uniform after vectorization
+; because it is stored to memory.
+;
+
+define i32 @pointer_iv_mixed(i32* noalias %a, i32** noalias %b, i64 %n) {
+; CHECK-LABEL: @pointer_iv_mixed(
+; CHECK: vector.body
+; CHECK: %[[IDX:.*]] = phi i64 [ 0, %vector.ph ], [ %{{.*}}, %vector.body ]
+; CHECK: %[[STEPVEC:.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: %[[TMP1:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[IDX]], i32 0
+; CHECK-NEXT: %[[TMP2:.*]] = shufflevector <vscale x 2 x i64> %[[TMP1]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: %[[VECIND1:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[STEPVEC]]
+; CHECK-NEXT: %[[APTRS1:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND1]]
+; CHECK-NEXT: %[[VSCALE64:.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: %[[VSCALE64X2:.*]] = shl i64 %[[VSCALE64]], 1
+; CHECK-NEXT: %[[TMP3:.*]] = insertelement <vscale x 2 x i64> poison, i64 %[[VSCALE64X2]], i32 0
+; CHECK-NEXT: %[[TMP4:.*]] = shufflevector <vscale x 2 x i64> %[[TMP3]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
+; CHECK-NEXT: %[[TMP5:.*]] = add <vscale x 2 x i64> %[[TMP4]], %[[STEPVEC]]
+; CHECK-NEXT: %[[VECIND2:.*]] = add <vscale x 2 x i64> %[[TMP2]], %[[TMP5]]
+; CHECK-NEXT: %[[APTRS2:.*]] = getelementptr i32, i32* %a, <vscale x 2 x i64> %[[VECIND2]]
+; CHECK-NEXT: %[[GEPB1:.*]] = getelementptr i32*, i32** %b, i64 %[[IDX]]
+; CHECK: %[[BPTR1:.*]] = bitcast i32** %[[GEPB1]] to <vscale x 2 x i32*>*
+; CHECK-NEXT: store <vscale x 2 x i32*> %[[APTRS1]], <vscale x 2 x i32*>* %[[BPTR1]], align 8
+; CHECK: %[[VSCALE32:.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: %[[VSCALE32X2:.*]] = shl i32 %[[VSCALE32]], 1
+; CHECK-NEXT: %[[TMP6:.*]] = sext i32 %[[VSCALE32X2]] to i64
+; CHECK-NEXT: %[[GEPB2:.*]] = getelementptr i32*, i32** %[[GEPB1]], i64 %[[TMP6]]
+; CHECK-NEXT: %[[BPTR2:.*]] = bitcast i32** %[[GEPB2]] to <vscale x 2 x i32*>*
+; CHECK-NEXT: store <vscale x 2 x i32*> %[[APTRS2]], <vscale x 2 x i32*>* %[[BPTR2]], align 8
+
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %p = phi i32* [ %tmp3, %for.body ], [ %a, %entry ]
+  %q = phi i32** [ %tmp4, %for.body ], [ %b, %entry ]
+  %tmp0 = phi i32 [ %tmp2, %for.body ], [ 0, %entry ]
+  %tmp1 = load i32, i32* %p, align 8
+  %tmp2 = add i32 %tmp1, %tmp0
+  store i32* %p, i32** %q, align 8
+  %tmp3 = getelementptr inbounds i32, i32* %p, i32 1
+  %tmp4 = getelementptr inbounds i32*, i32** %q, i32 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end, !llvm.loop !6
+
+for.end:
+  %tmp5 = phi i32 [ %tmp2, %for.body ]
+  ret i32 %tmp5
+}
+
 
 !0 = distinct !{!0, !1, !2, !3, !4, !5}
 !1 = !{!"llvm.loop.mustprogress"}
 !2 = !{!"llvm.loop.vectorize.width", i32 4}
 !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
 !4 = !{!"llvm.loop.vectorize.enable", i1 true}
 !5 = !{!"llvm.loop.interleave.count", i32 2}
+!6 = distinct !{!6, !1, !7, !3, !4, !5}
+!7 = !{!"llvm.loop.vectorize.width", i32 2}
