[LV] Fix incorrectly marking a pointer indvar as 'scalar'.
collectLoopScalars should only add non-uniform nodes to the list if they
are used by a load/store instruction that is marked as CM_Scalarize.
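
As a rough illustration, the standalone predicate below paraphrases that rule. It mirrors the IsDirectLoadStoreFromPtrIndvar lambda added in the diff further down, minus the isScalarUse query through which the real code also consults the CM_Scalarize cost-model decision; header paths assume an LLVM tree of this era.

// Minimal sketch of the new rule: a pointer induction variable may only
// stay scalar for a user that is a load/store addressing memory directly
// through it. (Simplified; the real check also requires isScalarUse.)
#include "llvm/Analysis/IVDescriptors.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static bool isDirectLoadStoreFromPtrIndvar(const InductionDescriptor &ID,
                                           Instruction *IndVar,
                                           Instruction *UserInst) {
  return ID.getKind() == InductionDescriptor::IK_PtrInduction &&
         (isa<LoadInst>(UserInst) || isa<StoreInst>(UserInst)) &&
         IndVar == getLoadStorePointerOperand(UserInst);
}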

Before this patch, the LV incorrectly marked pointer induction variables
as 'scalar' when they needed to be widened by something else, such as a
compare instruction, and weren't used by a node marked as 'CM_Scalarize'.
This case is covered by sve-widen-phi.ll.
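
A hypothetical C++ reduction of the affected pattern (function and variable names are mine, modeled on the @pointer_induction test in sve-widen-gep.ll below): the pointer IV has a direct load use, but it also feeds an equality compare, so the vectorizer must widen it rather than mark it 'scalar'.

// The pointer IV p is read through (a scalarizable load/store use) and
// is compared against another pointer (a use that requires a widened,
// vector-of-pointers IV when the loop is vectorized).
bool revisitsStart(char *start, long n) {
  char *p = start;
  bool seen = false;
  for (long i = 0; i <= n; ++i) {
    seen |= (*p != 0);    // direct load through the pointer IV
    ++p;
    seen |= (p == start); // compare of the IV itself: needs widening
  }
  return seen;
}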

This change also allows removing some code where the LV tried to
widen a pointer PHI with a stepvector even though the PHI was marked as
'scalarAfterVectorization'. Now that the cost model is more careful not
to mark instructions that need widening as 'scalar', that code has
become redundant.
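
For reference, a scalar emulation (my own sketch, not LLVM code) of the per-lane offsets that the widened pointer PHI now produces through the VECTOR_GEP sequence in the updated CHECK lines: lane L of unroll part P addresses POINTER_PHI + (P * VF + L) * Stride, and the PTR_IND update then bumps the PHI by UF * VF * Stride per vector iteration.

#include <cstdint>
#include <vector>

// Emulates the offsets of one unroll part of the widened pointer IV:
// offsets = (part_start + stepvector) * stride, applied to POINTER_PHI.
std::vector<int64_t> vectorGepOffsets(uint64_t VF, uint64_t Part,
                                      int64_t Stride) {
  std::vector<int64_t> Offsets(VF);
  for (uint64_t Lane = 0; Lane < VF; ++Lane)
    Offsets[Lane] = static_cast<int64_t>(Part * VF + Lane) * Stride;
  return Offsets;
}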

Differential Revision: https://reviews.llvm.org/D114373
sdesmalen-arm committed Nov 28, 2021
1 parent a9f837b commit 28a4dea
Showing 6 changed files with 235 additions and 199 deletions.
89 changes: 26 additions & 63 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -1915,9 +1915,11 @@ class LoopVectorizationCostModel {

/// Collect the instructions that are scalar after vectorization. An
/// instruction is scalar if it is known to be uniform or will be scalarized
/// during vectorization. Non-uniform scalarized instructions will be
/// represented by VF values in the vectorized loop, each corresponding to an
/// iteration of the original scalar loop.
/// during vectorization. collectLoopScalars should only add non-uniform nodes
/// to the list if they are used by a load/store instruction that is marked as
/// CM_Scalarize. Non-uniform scalarized instructions will be represented by
/// VF values in the vectorized loop, each corresponding to an iteration of
/// the original scalar loop.
void collectLoopScalars(ElementCount VF);

/// Keeps cost model vectorization decision and cost for instructions.
@@ -4862,38 +4864,14 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN,
// iteration. If the instruction is uniform, we only need to generate the
// first lane. Otherwise, we generate all VF values.
bool IsUniform = Cost->isUniformAfterVectorization(P, State.VF);
unsigned Lanes = IsUniform ? 1 : State.VF.getKnownMinValue();

bool NeedsVectorIndex = !IsUniform && VF.isScalable();
Value *UnitStepVec = nullptr, *PtrIndSplat = nullptr;
if (NeedsVectorIndex) {
Type *VecIVTy = VectorType::get(PtrInd->getType(), VF);
UnitStepVec = Builder.CreateStepVector(VecIVTy);
PtrIndSplat = Builder.CreateVectorSplat(VF, PtrInd);
}
assert((IsUniform || !State.VF.isScalable()) &&
"Cannot scalarize a scalable VF");
unsigned Lanes = IsUniform ? 1 : State.VF.getFixedValue();

for (unsigned Part = 0; Part < UF; ++Part) {
Value *PartStart =
createStepForVF(Builder, PtrInd->getType(), VF, Part);

if (NeedsVectorIndex) {
// Here we cache the whole vector, which means we can support the
// extraction of any lane. However, in some cases the extractelement
// instruction that is generated for scalar uses of this vector (e.g.
// a load instruction) is not folded away. Therefore we still
// calculate values for the first n lanes to avoid redundant moves
// (when extracting the 0th element) and to produce scalar code (i.e.
// additional add/gep instructions instead of expensive extractelement
// instructions) when extracting higher-order elements.
Value *PartStartSplat = Builder.CreateVectorSplat(VF, PartStart);
Value *Indices = Builder.CreateAdd(PartStartSplat, UnitStepVec);
Value *GlobalIndices = Builder.CreateAdd(PtrIndSplat, Indices);
Value *SclrGep =
emitTransformedIndex(Builder, GlobalIndices, PSE.getSE(), DL, II);
SclrGep->setName("next.gep");
State.set(PhiR, SclrGep, Part);
}

for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
Value *Idx = Builder.CreateAdd(
PartStart, ConstantInt::get(PtrInd->getType(), Lane));
@@ -5229,38 +5207,11 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
!TheLoop->isLoopInvariant(V);
};

auto isScalarPtrInduction = [&](Instruction *MemAccess, Value *Ptr) {
if (!isa<PHINode>(Ptr) ||
!Legal->getInductionVars().count(cast<PHINode>(Ptr)))
return false;
auto &Induction = Legal->getInductionVars()[cast<PHINode>(Ptr)];
if (Induction.getKind() != InductionDescriptor::IK_PtrInduction)
return false;
return isScalarUse(MemAccess, Ptr);
};

// A helper that evaluates a memory access's use of a pointer. If the
// pointer is actually the pointer induction of a loop, it is being
// inserted into Worklist. If the use will be a scalar use, and the
// pointer is only used by memory accesses, we place the pointer in
// ScalarPtrs. Otherwise, the pointer is placed in PossibleNonScalarPtrs.
// A helper that evaluates a memory access's use of a pointer. If the use will
// be a scalar use and the pointer is only used by memory accesses, we place
// the pointer in ScalarPtrs. Otherwise, the pointer is placed in
// PossibleNonScalarPtrs.
auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
if (isScalarPtrInduction(MemAccess, Ptr)) {
Worklist.insert(cast<Instruction>(Ptr));
LLVM_DEBUG(dbgs() << "LV: Found new scalar instruction: " << *Ptr
<< "\n");

Instruction *Update = cast<Instruction>(
cast<PHINode>(Ptr)->getIncomingValueForBlock(Latch));

// If there is more than one user of Update (Ptr), we shouldn't assume it
// will be scalar after vectorisation as other users of the instruction
// may require widening. Otherwise, add it to ScalarPtrs.
if (Update->hasOneUse() && cast<Value>(*Update->user_begin()) == Ptr) {
ScalarPtrs.insert(Update);
return;
}
}
// We only care about bitcast and getelementptr instructions contained in
// the loop.
if (!isLoopVaryingBitCastOrGEP(Ptr))
@@ -5352,11 +5303,22 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
continue;

// Returns true if \p Indvar is a pointer induction that is used directly by
// load/store instruction \p I.
auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
Instruction *I) {
return Induction.second.getKind() ==
InductionDescriptor::IK_PtrInduction &&
(isa<LoadInst>(I) || isa<StoreInst>(I)) &&
Indvar == getLoadStorePointerOperand(I) && isScalarUse(I, Indvar);
};

// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I) ||
IsDirectLoadStoreFromPtrIndvar(Ind, I);
});
if (!ScalarInd)
continue;
@@ -5366,7 +5328,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
auto ScalarIndUpdate =
llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
auto *I = cast<Instruction>(U);
return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
return I == Ind || !TheLoop->contains(I) || Worklist.count(I) ||
IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
});
if (!ScalarIndUpdate)
continue;
@@ -90,7 +90,7 @@ for.end:
;
; Same as predicate_store except we use a pointer PHI to maintain the address
;
; CHECK: Found new scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
; CHECK: Found scalar instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
; CHECK: Found scalar instruction: %addr.next = getelementptr inbounds i32, i32* %addr, i64 1
; CHECK: Scalarizing and predicating: store i32 %tmp2, i32* %addr, align 4
; CHECK: Found an estimated cost of 0 for VF 2 For instruction: %addr = phi i32* [ %a, %entry ], [ %addr.next, %for.inc ]
128 changes: 83 additions & 45 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll
@@ -41,35 +41,39 @@ define void @pointer_induction_used_as_vector(i8** noalias %start.1, i8* noalias
; CHECK-NEXT: [[IND_END3:%.*]] = getelementptr i8, i8* [[START_2:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START_2]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8*, i8** [[START_1]], i64 [[TMP5]]
; CHECK-NEXT: [[TMP6:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX]], i32 0
; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 1, [[TMP8]]
; CHECK-NEXT: [[TMP10:%.*]] = mul i64 [[TMP7]], 0
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP10]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP6]]
; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP7]]
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START_2]], <vscale x 2 x i64> [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, i8* [[START_2]], i64 [[TMP10]]
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP4]], i64 1
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = bitcast i8** [[TMP12]] to <vscale x 2 x i8*>*
; CHECK-NEXT: store <vscale x 2 x i8*> [[TMP11]], <vscale x 2 x i8*>* [[TMP13]], align 8
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, i8* [[NEXT_GEP5]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP15]], align 1
; CHECK-NEXT: [[TMP16:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP17:%.*]] = bitcast i8* [[TMP14]] to <vscale x 2 x i8>*
; CHECK-NEXT: store <vscale x 2 x i8> [[TMP16]], <vscale x 2 x i8>* [[TMP17]], align 1
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-NEXT: [[TMP11:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP11]]
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul <vscale x 2 x i64> [[TMP12]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <vscale x 2 x i64> [[VECTOR_GEP]]
; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[TMP13]], i64 1
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8*, i8** [[NEXT_GEP]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8** [[TMP15]] to <vscale x 2 x i8*>*
; CHECK-NEXT: store <vscale x 2 x i8*> [[TMP14]], <vscale x 2 x i8*>* [[TMP16]], align 8
; CHECK-NEXT: [[TMP17:%.*]] = extractelement <vscale x 2 x i8*> [[TMP13]], i32 0
; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, i8* [[TMP17]], i32 0
; CHECK-NEXT: [[TMP19:%.*]] = bitcast i8* [[TMP18]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP19]], align 1
; CHECK-NEXT: [[TMP20:%.*]] = add <vscale x 2 x i8> [[WIDE_LOAD]], shufflevector (<vscale x 2 x i8> insertelement (<vscale x 2 x i8> poison, i8 1, i32 0), <vscale x 2 x i8> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP21:%.*]] = bitcast i8* [[TMP18]] to <vscale x 2 x i8>*
; CHECK-NEXT: store <vscale x 2 x i8> [[TMP20]], <vscale x 2 x i8>* [[TMP21]], align 1
; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP23:%.*]] = mul i64 [[TMP22]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP23]]
; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP9]]
; CHECK-NEXT: br i1 [[TMP24]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -119,32 +123,66 @@ exit: ; preds = %loop.body

define void @pointer_induction(i8* noalias %start, i64 %N) {
; CHECK-LABEL: @pointer_induction(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP2:%.*]] = mul i64 [[TMP1]], 2
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK: vector.ph:
; CHECK: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8*> poison, i8* [[START:%.*]], i32 0
; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, i8* [[START:%.*]], i64 [[N_VEC]]
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i8*> poison, i8* [[START]], i32 0
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i8*> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i8*> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i8* [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP5:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[INDEX1]], i32 0
; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2
; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 1
; CHECK-NEXT: [[TMP8:%.*]] = mul i64 1, [[TMP7]]
; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP6]], 0
; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[TMP9]], i32 0
; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 2 x i64> [[DOTSPLATINSERT]], <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer
; CHECK-NEXT: [[TMP6:%.*]] = add <vscale x 2 x i64> zeroinitializer, [[TMP5]]
; CHECK-NEXT: [[TMP7:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP6]]
; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[START]], <vscale x 2 x i64> [[TMP7]]
; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP8]]
; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], 1
; CHECK-NEXT: [[NEXT_GEP4:%.*]] = getelementptr i8, i8* [[START]], i64 [[TMP9]]
; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, i8* [[NEXT_GEP3]], i32 0
; CHECK-NEXT: [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP12]], align 1
; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[NEXT_GEP]], i64 1
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <vscale x 2 x i8*> [[TMP13]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP16]]
; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK-NEXT: [[TMP10:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 2 x i64> [[DOTSPLAT]], [[TMP10]]
; CHECK-NEXT: [[VECTOR_GEP:%.*]] = mul <vscale x 2 x i64> [[TMP11]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i32 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, i8* [[POINTER_PHI]], <vscale x 2 x i64> [[VECTOR_GEP]]
; CHECK-NEXT: [[TMP13:%.*]] = add i64 [[INDEX1]], 0
; CHECK-NEXT: [[TMP14:%.*]] = extractelement <vscale x 2 x i8*> [[TMP12]], i32 0
; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, i8* [[TMP14]], i32 0
; CHECK-NEXT: [[TMP16:%.*]] = bitcast i8* [[TMP15]] to <vscale x 2 x i8>*
; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i8>, <vscale x 2 x i8>* [[TMP16]], align 1
; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, <vscale x 2 x i8*> [[TMP12]], i64 1
; CHECK-NEXT: [[TMP18:%.*]] = icmp eq <vscale x 2 x i8*> [[TMP17]], [[BROADCAST_SPLAT]]
; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64()
; CHECK-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP20]]
; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, i8* [[POINTER_PHI]], i64 [[TMP8]]
; CHECK-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
; CHECK-NEXT: [[BC_RESUME_VAL2:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: br label [[FOR_BODY:%.*]]
; CHECK: for.body:
; CHECK-NEXT: [[PTR_PHI:%.*]] = phi i8* [ [[PTR_PHI_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INDEX_NXT]] = add i64 [[INDEX]], 1
; CHECK-NEXT: [[TMP22:%.*]] = load i8, i8* [[PTR_PHI]], align 1
; CHECK-NEXT: [[PTR_PHI_NEXT]] = getelementptr inbounds i8, i8* [[PTR_PHI]], i64 1
; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq i8* [[PTR_PHI_NEXT]], [[START]]
; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDEX]], [[N]]
; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END]], !llvm.loop [[LOOP5:![0-9]+]]
; CHECK: end:
; CHECK-NEXT: ret void
;
entry:
br label %for.body
