[LV] Create & use VPScalarIVSteps for all scalar users.

This patch is a follow-up to D115953. It updates optimizeInductions
to also introduce new VPScalarIVStepsRecipes if an IV has both vector
and scalar uses.

It updates all uses that only need scalar values to use the newly
created recipe for the scalar steps.

This completes the untangling of VPWidenIntOrFpInductionRecipe
code-generation. Now the recipe *only* creates the widened vector
values, as it says on the tin.

The code to generate IR has been moved directly to
VPWidenIntOrFpInductionRecipe::execute.

Note that the recipe has been updated to hold a reference to
ScalarEvolution, which is needed to expand the step, until we can place
the corresponding SCEV expansion in the pre-header.

Depends on D120827.

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D120828
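For intuition, the per-lane values a VPScalarIVSteps recipe provides can be sketched in plain C++. This is a hedged restatement only — the helper name, the use of plain integers, and the loop framing are illustrative, not the recipe's actual execute implementation:

// Minimal sketch: the scalar induction values for one vector iteration,
// derived from the canonical IV. CanonicalIV is the scalar loop counter;
// Start and Step come from the induction descriptor. Names here are
// illustrative, not LLVM API.
#include <cstdint>
#include <vector>

std::vector<int64_t> scalarIVSteps(int64_t CanonicalIV, int64_t Start,
                                   int64_t Step, unsigned VF) {
  std::vector<int64_t> Lanes(VF);
  for (unsigned Lane = 0; Lane != VF; ++Lane)
    // Lane value = start + (canonical index + lane) * step.
    Lanes[Lane] = Start + (CanonicalIV + Lane) * Step;
  return Lanes;
}

Scalar users (for example, address computations feeding scalarized memory accesses) consume these per-lane values directly, so the widened vector IV no longer has to produce them.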
fhahn committed Mar 13, 2022
1 parent 665879b commit 95f76bf
Showing 28 changed files with 561 additions and 899 deletions.
309 changes: 122 additions & 187 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Large diffs are not rendered by default.

16 changes: 11 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1062,28 +1062,34 @@ class VPWidenGEPRecipe : public VPRecipeBase, public VPValue {
};

/// A recipe for handling phi nodes of integer and floating-point inductions,
-/// producing their vector and scalar values.
+/// producing their vector values.
class VPWidenIntOrFpInductionRecipe : public VPRecipeBase, public VPValue {
PHINode *IV;
const InductionDescriptor &IndDesc;
bool NeedsScalarIV;
bool NeedsVectorIV;

+/// SCEV used to expand step.
+/// FIXME: move expansion of step to the pre-header, once it is modeled
+/// explicitly.
+ScalarEvolution &SE;
+
public:
VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
const InductionDescriptor &IndDesc,
-bool NeedsScalarIV, bool NeedsVectorIV)
+bool NeedsScalarIV, bool NeedsVectorIV,
+ScalarEvolution &SE)
: VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(IV, this),
IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV),
-NeedsVectorIV(NeedsVectorIV) {}
+NeedsVectorIV(NeedsVectorIV), SE(SE) {}

VPWidenIntOrFpInductionRecipe(PHINode *IV, VPValue *Start,
const InductionDescriptor &IndDesc,
TruncInst *Trunc, bool NeedsScalarIV,
-bool NeedsVectorIV)
+bool NeedsVectorIV, ScalarEvolution &SE)
: VPRecipeBase(VPWidenIntOrFpInductionSC, {Start}), VPValue(Trunc, this),
IV(IV), IndDesc(IndDesc), NeedsScalarIV(NeedsScalarIV),
-NeedsVectorIV(NeedsVectorIV) {}
+NeedsVectorIV(NeedsVectorIV), SE(SE) {}

~VPWidenIntOrFpInductionRecipe() override = default;

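The new ScalarEvolution parameter shows up at every construction site. A hedged sketch of a call under the new signature, mirroring the call updated in the VPlanTransforms.cpp hunk below:

// Inside llvm/lib/Transforms/Vectorize, where "VPlan.h" is a local header.
// Phi (PHINode *), IndDesc (const InductionDescriptor &), Plan (VPlan *),
// and SE (ScalarEvolution &) are assumed to be in scope, as they are at
// the call site shown below.
VPValue *Start = Plan->getOrAddVPValue(IndDesc.getStartValue());
auto *NewRecipe = new VPWidenIntOrFpInductionRecipe(
    Phi, Start, IndDesc, /*NeedsScalarIV=*/false, /*NeedsVectorIV=*/true, SE);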
27 changes: 23 additions & 4 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -47,8 +47,8 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
auto *Phi = cast<PHINode>(VPPhi->getUnderlyingValue());
if (const auto *II = GetIntOrFpInductionDescriptor(Phi)) {
VPValue *Start = Plan->getOrAddVPValue(II->getStartValue());
-NewRecipe =
-new VPWidenIntOrFpInductionRecipe(Phi, Start, *II, false, true);
+NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi, Start, *II, false,
+true, SE);
} else {
Plan->addVPValue(Phi, VPPhi);
continue;
@@ -402,7 +402,7 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
for (VPRecipeBase &Phi : HeaderVPBB->phis()) {
auto *IV = dyn_cast<VPWidenIntOrFpInductionRecipe>(&Phi);
-if (!IV || IV->needsVectorIV())
+if (!IV || !IV->needsScalarIV())
continue;

const InductionDescriptor &ID = IV->getInductionDescriptor();
@@ -430,6 +430,25 @@ void VPlanTransforms::optimizeInductions(VPlan &Plan, ScalarEvolution &SE) {
HeaderVPBB->insert(cast<VPRecipeBase>(Step->getDef()),
HeaderVPBB->getFirstNonPhi());
}
-IV->replaceAllUsesWith(Steps);
+
+// If there are no vector users of IV, simply update all users to use Step
+// instead.
+if (!IV->needsVectorIV()) {
+IV->replaceAllUsesWith(Steps);
+continue;
+}
+
+// Otherwise only update scalar users of IV to use Step instead.
+SmallVector<VPUser *> Users(IV->user_begin(), IV->user_end());
+for (VPUser *U : Users) {
+VPRecipeBase *R = cast<VPRecipeBase>(U);
+if (!R->usesScalars(IV))
+continue;
+for (unsigned I = 0, E = R->getNumOperands(); I != E; I++) {
+if (R->getOperand(I) != IV)
+continue;
+R->setOperand(I, Steps);
+}
+}
}
}
6 changes: 4 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -55,8 +55,10 @@ struct VPlanTransforms {
/// removed.
static void removeDeadRecipes(VPlan &Plan, Loop &OrigLoop);

-// If all users of a vector IV need scalar values, provide them by building
-// scalar steps off of the canonical scalar IV, and remove the vector IV.
+/// If any user of a VPWidenIntOrFpInductionRecipe needs scalar values,
+/// provide them by building scalar steps off of the canonical scalar IV and
+/// update the original IV's users. This is an optional optimization to
+/// reduce the need for vector extracts.
static void optimizeInductions(VPlan &Plan, ScalarEvolution &SE);
};

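As a usage sketch, the two transforms declared above compose naturally. The wrapper below is hypothetical — the real call site is in the LoopVectorize.cpp changes, which are not rendered above — but the two signatures follow the header as shown:

// Hypothetical wrapper; lives conceptually inside
// llvm/lib/Transforms/Vectorize, where "VPlanTransforms.h" is a local
// header. Signatures follow the declarations above.
static void runVPlanCleanups(VPlan &Plan, ScalarEvolution &SE,
                             Loop &OrigLoop) {
  // Rewrite scalar users of widened IVs to use VPScalarIVSteps recipes.
  VPlanTransforms::optimizeInductions(Plan, SE);
  // Clean up recipes left dead by the rewrite, e.g. widened IVs that
  // lost all their users.
  VPlanTransforms::removeDeadRecipes(Plan, OrigLoop);
}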
62 changes: 30 additions & 32 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll
@@ -303,39 +303,37 @@ define void @gather_nxv4i32_ind64_stride2(float* noalias nocapture readonly %a,
; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 2
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT: [[DOTSPLATINSERT2:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
-; CHECK-NEXT: [[DOTSPLAT3:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT2]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT3]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2
-; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP7]], i64 0
-; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT4]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT5]], [[TMP4]]
-; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT3]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i64> [[TMP5]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = shl <vscale x 4 x i64> [[TMP9]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, float* [[B:%.*]], <vscale x 4 x i64> [[TMP10]]
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, float* [[B]], <vscale x 4 x i64> [[TMP11]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP12]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
-; CHECK-NEXT: [[WIDE_MASKED_GATHER6:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP15:%.*]] = bitcast float* [[TMP14]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER]], <vscale x 4 x float>* [[TMP15]], align 4
-; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i32 [[TMP16]], 2
-; CHECK-NEXT: [[TMP18:%.*]] = zext i32 [[TMP17]] to i64
-; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, float* [[TMP14]], i64 [[TMP18]]
-; CHECK-NEXT: [[TMP20:%.*]] = bitcast float* [[TMP19]] to <vscale x 4 x float>*
-; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER6]], <vscale x 4 x float>* [[TMP20]], align 4
-; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP22:%.*]] = shl nuw nsw i64 [[TMP21]], 3
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP22]]
-; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[STEP_ADD:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP8:%.*]] = shl <vscale x 4 x i64> [[STEP_ADD]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i32 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, float* [[B:%.*]], <vscale x 4 x i64> [[TMP7]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, float* [[B]], <vscale x 4 x i64> [[TMP8]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP9]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call <vscale x 4 x float> @llvm.masked.gather.nxv4f32.nxv4p0f32(<vscale x 4 x float*> [[TMP10]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x float> undef)
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, float* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP12:%.*]] = bitcast float* [[TMP11]] to <vscale x 4 x float>*
+; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER]], <vscale x 4 x float>* [[TMP12]], align 4
+; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i32 [[TMP13]], 2
+; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, float* [[TMP11]], i64 [[TMP15]]
+; CHECK-NEXT: [[TMP17:%.*]] = bitcast float* [[TMP16]] to <vscale x 4 x float>*
+; CHECK-NEXT: store <vscale x 4 x float> [[WIDE_MASKED_GATHER2]], <vscale x 4 x float>* [[TMP17]], align 4
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 3
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[STEP_ADD]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]]
@@ -346,9 +344,9 @@ define void @gather_nxv4i32_ind64_stride2(float* noalias nocapture readonly %a,
; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
; CHECK-NEXT: [[INDVARS_IV_STRIDE2:%.*]] = shl i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[B]], i64 [[INDVARS_IV_STRIDE2]]
-; CHECK-NEXT: [[TMP24:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = load float, float* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, float* [[A]], i64 [[INDVARS_IV]]
-; CHECK-NEXT: store float [[TMP24]], float* [[ARRAYIDX2]], align 4
+; CHECK-NEXT: store float [[TMP21]], float* [[ARRAYIDX2]], align 4
; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]]
; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
39 changes: 21 additions & 18 deletions llvm/test/Transforms/LoopVectorize/AArch64/sve-inductions.ll
@@ -22,25 +22,28 @@ define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readon
; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2
; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N]], [[TMP3]]
; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP6]], i64 0
+; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT: [[DOTSPLATINSERT1:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[INDEX]], i64 0
-; CHECK-NEXT: [[DOTSPLAT2:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT1]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = add <vscale x 4 x i64> [[DOTSPLAT2]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = trunc <vscale x 4 x i64> [[TMP5]] to <vscale x 4 x i1>
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <vscale x 4 x i32>*
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP8]], i32 4, <vscale x 4 x i1> [[TMP6]], <vscale x 4 x i32> poison)
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <vscale x 4 x i32>*
-; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP10]], i32 4, <vscale x 4 x i1> [[TMP6]])
-; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 2
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP12]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = trunc <vscale x 4 x i64> [[VEC_IND]] to <vscale x 4 x i1>
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, i32* [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <vscale x 4 x i32>*
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i32> @llvm.masked.load.nxv4i32.p0nxv4i32(<vscale x 4 x i32>* [[TMP9]], i32 4, <vscale x 4 x i1> [[TMP7]], <vscale x 4 x i32> poison)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, i32* [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <vscale x 4 x i32>*
+; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0nxv4i32(<vscale x 4 x i32> [[WIDE_MASKED_LOAD]], <vscale x 4 x i32>* [[TMP11]], i32 4, <vscale x 4 x i1> [[TMP7]])
+; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP13:%.*]] = shl i64 [[TMP12]], 2
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
+; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0
; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -54,9 +57,9 @@ define void @cond_ind64(i32* noalias nocapture %a, i32* noalias nocapture readon
; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]]
; CHECK: if.then:
; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[I_08]]
-; CHECK-NEXT: [[TMP14:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT: [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[I_08]]
-; CHECK-NEXT: store i32 [[TMP14]], i32* [[ARRAYIDX1]], align 4
+; CHECK-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX1]], align 4
; CHECK-NEXT: br label [[FOR_INC]]
; CHECK: for.inc:
; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_08]], 1
