diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index 091d94843698c..aadc778333635 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1644,7 +1644,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
     Value *MemberPtr = getLoadStorePointerOperand(Member);
     Type *AccessTy = getLoadStoreType(Member);
     if (getPtrStride(PSE, AccessTy, MemberPtr, TheLoop, Strides,
-                     /*Assume=*/false, /*ShouldCheckWrap=*/true).value_or(0))
+                     /*Assume=*/true, /*ShouldCheckWrap=*/true))
       return false;
     LLVM_DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                       << FirstOrLast
@@ -1657,10 +1657,6 @@ void InterleavedAccessInfo::analyzeInterleaving(
   // accesses may wrap around. We have to revisit the getPtrStride analysis,
   // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
   // not check wrapping (see documentation there).
-  // FORNOW we use Assume=false;
-  // TODO: Change to Assume=true but making sure we don't exceed the threshold
-  // of runtime SCEV assumptions checks (thereby potentially failing to
-  // vectorize altogether).
   // Additional optional optimizations:
   // TODO: If we are peeling the loop and we know that the first pointer doesn't
   // wrap then we can deduce that all pointers in the group don't wrap.
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
index 9ebe79096adc4..ba45c4dd64b17 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll
@@ -154,6 +154,24 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT:    br label [[VECTOR_PH:%.*]]
+; CHECK:       vector.scevcheck:
+; CHECK-NEXT:    [[UMAX1:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 4)
+; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[UMAX1]], -1
+; CHECK-NEXT:    [[TMP16:%.*]] = lshr i64 [[TMP14]], 2
+; CHECK-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP16]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult ptr [[TMP5]], [[SRC]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    [[MUL1:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 16, i64 [[TMP16]])
+; CHECK-NEXT:    [[MUL_RESULT2:%.*]] = extractvalue { i64, i1 } [[MUL1]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW3:%.*]] = extractvalue { i64, i1 } [[MUL1]], 1
+; CHECK-NEXT:    [[TMP17:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[MUL_RESULT2]]
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp ult ptr [[TMP17]], [[DST]]
+; CHECK-NEXT:    [[TMP24:%.*]] = or i1 [[TMP23]], [[MUL_OVERFLOW3]]
+; CHECK-NEXT:    [[TMP25:%.*]] = or i1 [[TMP7]], [[TMP24]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH1:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4
@@ -171,12 +189,12 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH
]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[SRC:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH1]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH1]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH1]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP19:%.*]] = getelementptr i32, ptr [[SRC]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i32> @llvm.masked.gather.nxv4i32.nxv4p0(<vscale x 4 x ptr> [[TMP19]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> poison)
-; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP20:%.*]] = getelementptr i32, ptr [[DST]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[WIDE_MASKED_GATHER]], <vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add i64 [[INDEX1]], [[TMP4]]
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP12]])
@@ -186,8 +204,7 @@ define void @copy_stride4(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[WHILE_END_LOOPEXIT:%.*]]
-; CHECK:       while.end.loopexit:
-; CHECK-NEXT:    ret void
+; CHECK:       scalar.ph:
 ;
 entry:
   br label %while.body
@@ -235,7 +252,7 @@ define void @simple_gather_scatter(ptr noalias %dst, ptr noalias %src, ptr noali
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT:    [[TMP16:%.*]] = xor i1 [[TMP15]], true
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK:       while.end.loopexit:
@@ -289,7 +306,7 @@ define void @uniform_load(ptr noalias %dst, ptr noalias readonly %src, i64 %n) #
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT:    [[TMP13:%.*]] = xor i1 [[TMP14]], true
-; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       for.end:
@@ -348,7 +365,7 @@ define void @cond_uniform_load(ptr noalias %dst, ptr noalias readonly %src, ptr
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
 ; CHECK-NEXT:    [[TMP17:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT:    [[TMP18:%.*]] = xor i1 [[TMP17]], true
-; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK
:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       for.end:
@@ -410,7 +427,7 @@ define void @uniform_store(ptr noalias %dst, ptr noalias readonly %src, i64 %n)
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT:    [[TMP12:%.*]] = xor i1 [[TMP13]], true
-; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[FOR_END:%.*]]
 ; CHECK:       for.end:
@@ -462,7 +479,7 @@ define void @simple_fdiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
 ; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT:    [[TMP14:%.*]] = xor i1 [[TMP13]], true
-; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK:       while.end.loopexit:
@@ -518,7 +535,7 @@ define void @simple_idiv(ptr noalias %dst, ptr noalias %src, i64 %n) #0 {
 ; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX1]], i64 [[TMP9]])
 ; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
 ; CHECK-NEXT:    [[TMP17:%.*]] = xor i1 [[TMP14]], true
-; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br label [[WHILE_END_LOOPEXIT:%.*]]
 ; CHECK:       while.end.loopexit:
@@ -561,7 +578,7 @@ define void @simple_memset_trip1024(i32 %val, ptr %ptr, i64 %n) #0 {
 ; CHECK-NEXT:    store <vscale x 4 x i32> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH:%.*]]
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
index 10f8f742bb1e2..10f354343c553 100644
--- a/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/dead-ops-cost.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5
 ; RUN: opt -p loop-vectorize -mtriple riscv64-linux-gnu -mattr=+v,+f -S %s | FileCheck %s
 
 target datalayout = "e-m:e
-p:64:64-i64:64-i128:128-n32:64-S128"
@@ -351,7 +351,18 @@ define void @gather_interleave_group_with_dead_insert_pos(i64 %N, ptr noalias %s
 ; CHECK-NEXT:    [[TMP0:%.*]] = add nuw i64 [[SMAX]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
-; CHECK-NEXT:    br label %[[VECTOR_PH:.*]]
+; CHECK-NEXT:    br label %[[VECTOR_SCEVCHECK:.*]]
+; CHECK:       [[VECTOR_SCEVCHECK]]:
+; CHECK-NEXT:    [[SMAX1:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw i64 [[SMAX1]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i64 [[TMP3]], 1
+; CHECK-NEXT:    [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 8, i64 [[TMP4]])
+; CHECK-NEXT:    [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0
+; CHECK-NEXT:    [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[MUL_RESULT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp ult ptr [[TMP5]], [[DST]]
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP6]], [[MUL_OVERFLOW]]
+; CHECK-NEXT:    br i1 [[TMP7]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
 ; CHECK:       [[VECTOR_PH]]:
 ; CHECK-NEXT:    [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.stepvector.nxv4i64()
 ; CHECK-NEXT:    [[TMP11:%.*]] = mul <vscale x 4 x i64> [[TMP9]], splat (i64 2)
@@ -383,6 +394,21 @@
 ; CHECK-NEXT:    br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
 ; CHECK-NEXT:    br label %[[EXIT:.*]]
+; CHECK:       [[SCALAR_PH]]:
+; CHECK-NEXT:    br label %[[LOOP:.*]]
+; CHECK:       [[LOOP]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[LOOP]] ], [ 0, %[[SCALAR_PH]] ]
+; CHECK-NEXT:    [[GEP_SRC_0:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV]]
+; CHECK-NEXT:    [[L_DEAD:%.*]] = load i8, ptr [[GEP_SRC_0]], align 1
+; CHECK-NEXT:    [[IV_1:%.*]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[GEP_SRC_1:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[IV_1]]
+; CHECK-NEXT:    [[L_1:%.*]] = load i8, ptr [[GEP_SRC_1]], align 1
+; CHECK-NEXT:    [[EXT:%.*]] = zext i8 [[L_1]] to i32
+; CHECK-NEXT:    [[GEP_DST:%.*]] = getelementptr i32, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i32 [[EXT]], ptr [[GEP_DST]], align 4
+; CHECK-NEXT:    [[IV_NEXT]] = add nsw i64 [[IV]], 2
+; CHECK-NEXT:    [[EC:%.*]] = icmp slt i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT]], !llvm.loop [[LOOP19:![0-9]+]]
 ; CHECK:       [[EXIT]]:
 ; CHECK-NEXT:    ret void
 ;
@@ -408,25 +434,3 @@ exit:
 }
 
 attributes #0 = { "target-features"="+64bit,+v" }
-
-;.
-; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1
:![0-9]+]], [[META2:![0-9]+]]}
-; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
-; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
-; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
-; CHECK: [[META4]] = distinct !{[[META4]], [[META5:![0-9]+]]}
-; CHECK: [[META5]] = distinct !{[[META5]], !"LVerDomain"}
-; CHECK: [[META6]] = !{[[META7:![0-9]+]]}
-; CHECK: [[META7]] = distinct !{[[META7]], [[META5]]}
-; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META1]], [[META2]]}
-; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META1]]}
-; CHECK: [[META10]] = !{[[META11:![0-9]+]]}
-; CHECK: [[META11]] = distinct !{[[META11]], [[META12:![0-9]+]]}
-; CHECK: [[META12]] = distinct !{[[META12]], !"LVerDomain"}
-; CHECK: [[META13]] = !{[[META14:![0-9]+]]}
-; CHECK: [[META14]] = distinct !{[[META14]], [[META12]]}
-; CHECK: [[LOOP15]] = distinct !{[[LOOP15]], [[META1]], [[META2]]}
-; CHECK: [[LOOP16]] = distinct !{[[LOOP16]], [[META1]]}
-; CHECK: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]}
-; CHECK: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]}
-;.
diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
index 2f9627855a2c9..5cfb7859c0e02 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/interleave-cost.ll
@@ -351,7 +351,6 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    br label %[[VECTOR_BODY:.*]]
 ; CHECK:       [[VECTOR_BODY]]:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 8, i64 16, i64 24>, %[[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
 ; CHECK-NEXT:    [[TMP51:%.*]] = lshr exact i64 [[OFFSET_IDX]], 1
 ; CHECK-NEXT:    [[TMP52:%.*]] = getelementptr nusw i32, ptr [[B]], i64 [[TMP51]]
@@ -359,8 +358,9 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP52]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; CHECK-NEXT:    [[STRIDED_VEC34:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 ; CHECK-NEXT:    [[TMP56:%.*]] = getelementptr i32, ptr [[A]], i64 [[OFFSET_IDX]]
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr i32, ptr [[B]], <4 x i64> [[VEC_IND]]
-; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP54]], i32 4, <4 x i1> splat (i1 true), <4 x i32> poison), !alias.scope [[META15:![0-9]+]], !noalias [[META13]]
+; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr i32, ptr [[B]], i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[WIDE_VEC42:%.*]] = load <32 x i32>, ptr [[TMP54]], align 4, !alias.scope [[META15:![0-9]+]], !noalias [[META13]]
+; CHECK-NEXT:    [[WIDE_MASKED_GATHER:%.*]] = shufflevector <32 x i32> [[WIDE_VEC42]], <32 x i32> poison, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
 ; CHECK-NEXT:    [[TMP58:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP59:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC34]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    [[TMP60:%.*]] = shufflevector <4 x i32> [[WIDE_MASKED_GATHER]], <4 x i32> zeroinitializer, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -370,7 +370,6 @@ define void @geps_feeding_interleave_groups_with_reuse2(ptr %A, ptr %B, i64 %N)
 ; CHECK-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <32 x i32> [[TMP63]], <32 x i32> poison, <32 x i32
> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29, i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30, i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
 ; CHECK-NEXT:    store <32 x i32> [[INTERLEAVED_VEC]], ptr [[TMP56]], align 4, !alias.scope [[META13]]
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
-; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 32)
 ; CHECK-NEXT:    [[TMP64:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[TMP64]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
 ; CHECK:       [[MIDDLE_BLOCK]]:
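
Note on the functional change: flipping `Assume` to `true` lets `getPtrStride` return a stride for a pointer whose no-wrap property can only be guaranteed by a runtime SCEV predicate; the predicate is recorded on the `PredicatedScalarEvolution` and later emitted as the `vector.scevcheck` blocks the updated tests expect (the `!llvm.loop [[LOOP...]]` renumbering is fallout from the extra blocks and metadata). The tradeoff the removed FORNOW/TODO comment worried about still exists in principle: each recorded assumption becomes a runtime check, so too many predicates can make vectorization unprofitable. Below is a minimal sketch of the query pattern under `Assume=true`, using the `getPtrStride` declaration from `llvm/Analysis/LoopAccessAnalysis.h`; the helper name `memberMayWrap` is illustrative and not part of the patch:

```cpp
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include <optional>

using namespace llvm;

// Illustrative helper (not from the patch): decide whether an interleave-group
// member's pointer may still wrap once runtime predicates are allowed.
static bool memberMayWrap(PredicatedScalarEvolution &PSE, Type *AccessTy,
                          Value *Ptr, const Loop *TheLoop,
                          const DenseMap<Value *, const SCEV *> &Strides) {
  // With Assume=true, getPtrStride may succeed by recording a no-wrap
  // predicate on PSE instead of failing outright; the vectorizer later
  // materializes all recorded predicates as a vector.scevcheck guard.
  std::optional<int64_t> Stride =
      getPtrStride(PSE, AccessTy, Ptr, TheLoop, Strides,
                   /*Assume=*/true, /*ShouldCheckWrap=*/true);
  return !Stride.has_value();
}
```

If the guard fails at run time, control transfers to the scalar loop (`scalar.ph`), which is why the regenerated tests now also check the scalar loop bodies.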