[VPlan] Fold (MUL A, 1) -> A as VPlan2VPlan transform.
Add first VPlan-based recipe simplification to fold (MUL A, 1) -> A.
Among other things, this enables additional simplifications after
applying versioned strides, as a follow-up to D147783.
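
To illustrate the effect on the generated IR, here is a hypothetical, reduced
sketch (function and value names are made up for illustration and are not
taken from the updated tests below). Once loop versioning specializes the
stride to 1, the vector body contains a multiply by the live-in constant 1;
the new fold replaces the multiply's result with its other operand, and
removeDeadRecipes then deletes the now-dead recipe:

  ; Illustrative only: "before" and "after" shown in one function.
  define void @fold_mul_by_one(ptr %p, i64 %iv) {
    ; Before: the versioned stride of 1 leaves a redundant multiply behind.
    %t = mul nuw nsw i64 %iv, 1
    %gep.before = getelementptr i32, ptr %p, i64 %t
    ; After: (MUL %iv, 1) is folded to %iv, so the address uses %iv directly.
    %gep.after = getelementptr i32, ptr %p, i64 %iv
    ret void
  }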

Reviewed By: Ayal

Differential Revision: https://reviews.llvm.org/D159200
fhahn committed Sep 18, 2023
1 parent 4898c33 commit f108c6c
Showing 4 changed files with 100 additions and 62 deletions.
44 changes: 44 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -799,11 +799,55 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
}
}

+/// Returns true if \p V is a constant one.
+static bool isConstantOne(VPValue *V) {
+  if (!V->isLiveIn())
+    return false;
+  auto *C = dyn_cast<ConstantInt>(V->getLiveInIRValue());
+  return C && C->isOne();
+}
+
+/// Returns the llvm::Instruction opcode for \p R.
+static unsigned getOpcodeForRecipe(VPRecipeBase &R) {
+  if (auto *WidenR = dyn_cast<VPWidenRecipe>(&R))
+    return WidenR->getUnderlyingInstr()->getOpcode();
+  if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R))
+    return RepR->getUnderlyingInstr()->getOpcode();
+  if (auto *VPI = dyn_cast<VPInstruction>(&R))
+    return VPI->getOpcode();
+  return 0;
+}
+
+/// Try to simplify recipe \p R.
+static void simplifyRecipe(VPRecipeBase &R) {
+  unsigned Opcode = getOpcodeForRecipe(R);
+  if (Opcode == Instruction::Mul) {
+    VPValue *A = R.getOperand(0);
+    VPValue *B = R.getOperand(1);
+    if (isConstantOne(A))
+      return R.getVPSingleValue()->replaceAllUsesWith(B);
+    if (isConstantOne(B))
+      return R.getVPSingleValue()->replaceAllUsesWith(A);
+  }
+}
+
+/// Try to simplify the recipes in \p Plan.
+static void simplifyRecipes(VPlan &Plan) {
+  ReversePostOrderTraversal<VPBlockDeepTraversalWrapper<VPBlockBase *>> RPOT(
+      Plan.getEntry());
+  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(RPOT)) {
+    for (VPRecipeBase &R : make_early_inc_range(*VPBB)) {
+      simplifyRecipe(R);
+    }
+  }
+}

void VPlanTransforms::optimize(VPlan &Plan, ScalarEvolution &SE) {
  removeRedundantCanonicalIVs(Plan);
  removeRedundantInductionCasts(Plan);

  optimizeInductions(Plan, SE);
+  simplifyRecipes(Plan);
  removeDeadRecipes(Plan);

  createAndOptimizeReplicateRegions(Plan);
@@ -16,18 +16,17 @@ define void @test_stride1_4i32(ptr readonly %data, ptr noalias nocapture %dst, i
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -38,8 +37,8 @@ define void @test_stride1_4i32(ptr readonly %data, ptr noalias nocapture %dst, i
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], 1
; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP8]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
@@ -341,18 +340,17 @@ define void @test_stride_loopinvar_4i32(ptr readonly %data, ptr noalias nocaptur
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0
; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP0]], i32 [[N]])
-; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i32 [[TMP0]], 1
-; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 2
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0
-; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP4]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
-; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0
-; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP5]], ptr [[TMP7]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i32 [[TMP0]], 2
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[DATA:%.*]], i32 [[TMP1]]
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0
+; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP3]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> poison)
+; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> <i32 5, i32 5, i32 5, i32 5>, [[WIDE_MASKED_LOAD]]
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i32 [[TMP0]]
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0
+; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> [[TMP4]], ptr [[TMP6]], i32 4, <4 x i1> [[ACTIVE_LANE_MASK]])
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 true, label [[END:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph:
@@ -363,8 +361,8 @@ define void @test_stride_loopinvar_4i32(ptr readonly %data, ptr noalias nocaptur
; CHECK-NEXT: [[MUL:%.*]] = mul nuw nsw i32 [[I_023]], [[STRIDE]]
; CHECK-NEXT: [[ADD5:%.*]] = add nuw nsw i32 [[MUL]], 2
; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i32, ptr [[DATA]], i32 [[ADD5]]
-; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
-; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP9]]
+; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX6]], align 4
+; CHECK-NEXT: [[ADD7:%.*]] = add nsw i32 5, [[TMP8]]
; CHECK-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i32 [[I_023]]
; CHECK-NEXT: store i32 [[ADD7]], ptr [[ARRAYIDX9]], align 4
; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_023]], 1
46 changes: 22 additions & 24 deletions llvm/test/Transforms/LoopVectorize/RISCV/strided-accesses.ll
@@ -233,17 +233,16 @@ define void @single_stride_int_scaled(ptr %p, i64 %stride) {
; NOSTRIDED: vector.body:
; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NOSTRIDED-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0
-; NOSTRIDED-NEXT: [[TMP6:%.*]] = mul nuw nsw i64 [[TMP5]], 1
-; NOSTRIDED-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP6]]
-; NOSTRIDED-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP7]], i32 0
-; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP8]], align 4
-; NOSTRIDED-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP9]], ptr [[TMP8]], align 4
-; NOSTRIDED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 4
-; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]]
-; NOSTRIDED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NOSTRIDED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; NOSTRIDED-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP5]]
+; NOSTRIDED-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[TMP6]], i32 0
+; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP7]], align 4
+; NOSTRIDED-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP8]], ptr [[TMP7]], align 4
+; NOSTRIDED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 4
+; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
+; NOSTRIDED-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NOSTRIDED-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; NOSTRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -450,19 +449,18 @@ define void @double_stride_int_scaled(ptr %p, ptr %p2, i64 %stride) {
; NOSTRIDED: vector.body:
; NOSTRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
; NOSTRIDED-NEXT: [[TMP9:%.*]] = add i64 [[INDEX]], 0
-; NOSTRIDED-NEXT: [[TMP10:%.*]] = mul nuw nsw i64 [[TMP9]], 1
-; NOSTRIDED-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP10]]
-; NOSTRIDED-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0
-; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP12]], align 4
-; NOSTRIDED-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; NOSTRIDED-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[P2]], i64 [[TMP10]]
-; NOSTRIDED-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP14]], i32 0
-; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP13]], ptr [[TMP15]], align 4
-; NOSTRIDED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; NOSTRIDED-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4
-; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]]
-; NOSTRIDED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; NOSTRIDED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; NOSTRIDED-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP9]]
+; NOSTRIDED-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
+; NOSTRIDED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
+; NOSTRIDED-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; NOSTRIDED-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[P2]], i64 [[TMP9]]
+; NOSTRIDED-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
+; NOSTRIDED-NEXT: store <vscale x 4 x i32> [[TMP12]], ptr [[TMP14]], align 4
+; NOSTRIDED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
+; NOSTRIDED-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 4
+; NOSTRIDED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; NOSTRIDED-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; NOSTRIDED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; NOSTRIDED: middle.block:
; NOSTRIDED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
; NOSTRIDED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
@@ -17,19 +17,17 @@ define void @test(ptr %A, i32 %x) {
; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[OFFSET_IDX]], 0
; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 1
; CHECK-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; CHECK-NEXT: [[TMP5:%.*]] = mul i32 [[TMP4]], 1
-; CHECK-NEXT: [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP7]], i32 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP8]], align 4
-; CHECK-NEXT: [[TMP9:%.*]] = mul i32 [[TMP1]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = zext i32 [[TMP9]] to i64
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP11]], i32 0
-; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], ptr [[TMP12]], align 4
+; CHECK-NEXT: [[TMP5:%.*]] = zext i32 [[TMP4]] to i64
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP5]]
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i32 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP1]] to i64
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 0
+; CHECK-NEXT: store <4 x float> [[WIDE_LOAD]], ptr [[TMP10]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
-; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef
+; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK: middle.block:
; CHECK-NEXT: br i1 undef, label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK: scalar.ph: