diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 3ce9d29d34553..773c1559ec679 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4479,6 +4479,28 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( Type *TCType = Legal->getWidestInductionType(); const SCEV *RemainingIterations = nullptr; unsigned MaxTripCount = 0; + if (MainLoopVF.isFixed()) { + // TODO: extend to support scalable VFs. + const SCEV *TC = vputils::getSCEVExprForVPValue( + getPlanFor(MainLoopVF).getTripCount(), SE); + assert(!isa(TC) && + "Trip count SCEV must be computable"); + RemainingIterations = SE.getURemExpr( + TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC)); + + // No iterations left to process in the epilogue. + if (RemainingIterations->isZero()) + return Result; + + MaxTripCount = MainLoopVF.getFixedValue() * IC - 1; + if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, + SE.getConstant(TCType, MaxTripCount))) { + MaxTripCount = SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); + } + LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " + << MaxTripCount << "\n"); + } + for (auto &NextVF : ProfitableVFs) { // Skip candidate VFs without a corresponding VPlan. if (!hasPlanWithVF(NextVF.Width)) @@ -4496,24 +4518,7 @@ VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( // If NextVF is greater than the number of remaining iterations, the // epilogue loop would be dead. Skip such factors. - if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) { - // TODO: extend to support scalable VFs. - if (!RemainingIterations) { - const SCEV *TC = vputils::getSCEVExprForVPValue( - getPlanFor(NextVF.Width).getTripCount(), SE); - assert(!isa(TC) && - "Trip count SCEV must be computable"); - RemainingIterations = SE.getURemExpr( - TC, SE.getConstant(TCType, MainLoopVF.getFixedValue() * IC)); - MaxTripCount = MainLoopVF.getFixedValue() * IC - 1; - if (SE.isKnownPredicate(CmpInst::ICMP_ULT, RemainingIterations, - SE.getConstant(TCType, MaxTripCount))) { - MaxTripCount = - SE.getUnsignedRangeMax(RemainingIterations).getZExtValue(); - } - LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: " - << MaxTripCount << "\n"); - } + if (RemainingIterations && !NextVF.Width.isScalable()) { if (SE.isKnownPredicate( CmpInst::ICMP_UGT, SE.getConstant(TCType, NextVF.Width.getFixedValue()), diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll index 1f619898ea788..812bca9841f85 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll @@ -46,27 +46,17 @@ define void @_Z3foov() { ; CHECK-V2-IC4-LABEL: define void @_Z3foov( ; CHECK-V2-IC4-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY1:.*:]] -; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]], !prof [[PROF0:![0-9]+]] -; CHECK-V2-IC4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-V2-IC4: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0]] +; CHECK-V2-IC4: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]], !prof [[PROF0:![0-9]+]] ; CHECK-V2-IC4: [[VECTOR_PH]]: ; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]] ; CHECK-V2-IC4: [[VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP12:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-V2-IC4: br i1 [[TMP10:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK-V2-IC4: [[MIDDLE_BLOCK]]: -; CHECK-V2-IC4: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]], !prof [[PROF5:![0-9]+]] -; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6:![0-9]+]] -; CHECK-V2-IC4: [[VEC_EPILOG_PH]]: -; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-V2-IC4: br i1 [[TMP23:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] -; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]], !prof [[PROF8:![0-9]+]] -; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-V2-IC4: br i1 true, label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]], !prof [[PROF5:![0-9]+]] +; CHECK-V2-IC4: [[SCALAR_PH]]: ; CHECK-V2-IC4: br label %[[FOR_BODY:.*]] ; CHECK-V2-IC4: [[FOR_BODY]]: -; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF9:![0-9]+]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK-V2-IC4: [[FOR_COND_CLEANUP]]: ; entry: @@ -111,9 +101,6 @@ for.cond.cleanup: ; preds = %for.body ; CHECK-V2-IC4: [[META3]] = !{!"llvm.loop.isvectorized", i32 1} ; CHECK-V2-IC4: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"} ; CHECK-V2-IC4: [[PROF5]] = !{!"branch_weights", i32 1, i32 15} -; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 2, i32 0} -; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META3]], [[META4]]} -; CHECK-V2-IC4: [[PROF8]] = !{!"branch_weights", i32 1, i32 1} -; CHECK-V2-IC4: [[PROF9]] = !{!"branch_weights", i32 0, i32 0} -; CHECK-V2-IC4: [[LOOP10]] = distinct !{[[LOOP10]], [[META4]], [[META3]]} +; CHECK-V2-IC4: [[PROF6]] = !{!"branch_weights", i32 0, i32 0} +; CHECK-V2-IC4: [[LOOP7]] = distinct !{[[LOOP7]], [[META4]], [[META3]]} ;. diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll new file mode 100644 index 0000000000000..298ef09904b77 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -0,0 +1,399 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 +; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s + +target triple = "aarch64-linux-gnu" + +; Original loop has trip count 16, but contains interleave groups with gaps, so +; the last iteration must execute in the scalar loop. Thus the vector loop can +; only execute up to 15 iterations. +define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 { +; CHECK-LABEL: define i64 @vector_loop_with_remaining_iterations( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 17, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i32> [[TMP7]] to <16 x i64> +; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]]) +; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP15]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 17, [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 17, [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = insertelement zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.abs.nxv2i32( [[BROADCAST_SPLAT2]], i1 false) +; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.abs.nxv2i32( [[BROADCAST_SPLAT2]], i1 false) +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = mul [[TMP25]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add [[BROADCAST_SPLAT4]], [[TMP26]] +; CHECK-NEXT: [[TMP27:%.*]] = mul i64 1, [[TMP21]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP27]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], [[VEC_IND]], i32 0, i64 3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i8.nxv2p0( [[TMP28]], i32 1, splat (i1 true), poison) +; CHECK-NEXT: [[TMP29:%.*]] = zext [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP30:%.*]] = call @llvm.umin.nxv2i32( [[TMP23]], [[TMP29]]) +; CHECK-NEXT: [[TMP31:%.*]] = call @llvm.umin.nxv2i32( [[TMP24]], [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i32 0 +; CHECK-NEXT: store zeroinitializer, ptr [[TMP33]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = zext [[TMP31]] to +; CHECK-NEXT: [[TMP35]] = or [[VEC_PHI8]], [[TMP34]] +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP21]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64( [[TMP35]]) +; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP37]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3 +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 +; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]]) +; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]]) +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1 +; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64 +; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3 + %l = load i8, ptr %gep.src.i.i, align 1 + %l.ext = zext i8 %l to i32 + %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false) + %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext) + %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false) + %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0) + %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 0, ptr %gep.dst, align 1 + %min.ext = zext i32 %min.1 to i64 + %red.next = or i64 %red, %min.ext + %iv.next = add i64 %iv, 1 + %exitcond.not.i.i = icmp eq i64 %iv.next, 17 + br i1 %exitcond.not.i.i, label %exit, label %loop + +exit: + ret i64 %red.next +} + +; Original loop has trip count 17, but contains interleave groups with gaps, so +; the last iteration must execute in the scalar loop. Thus the vector loop can +; only execute up to 16 iterations. +define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 { +; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations( +; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ITER_CHECK:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 17, [[TMP1]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; CHECK-NEXT: br i1 false, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]]) +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0 +; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP27]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = zext <16 x i32> [[TMP7]] to <16 x i64> +; CHECK-NEXT: [[TMP11]] = or <16 x i64> [[VEC_PHI]], [[TMP10]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP11]]) +; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 2 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 1, [[TMP15]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 17, [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP19:%.*]] = select i1 [[TMP18]], i64 [[TMP17]], i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 17, [[TMP19]] +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul nuw i64 [[TMP20]], 2 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[X]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; CHECK-NEXT: [[TMP22:%.*]] = insertelement zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 +; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.abs.nxv2i32( [[BROADCAST_SPLAT2]], i1 false) +; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.abs.nxv2i32( [[BROADCAST_SPLAT2]], i1 false) +; CHECK-NEXT: [[TMP25:%.*]] = call @llvm.stepvector.nxv2i64() +; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement poison, i64 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector [[BROADCAST_SPLATINSERT3]], poison, zeroinitializer +; CHECK-NEXT: [[TMP38:%.*]] = mul [[TMP25]], splat (i64 1) +; CHECK-NEXT: [[INDUCTION:%.*]] = add [[BROADCAST_SPLAT4]], [[TMP38]] +; CHECK-NEXT: [[TMP39:%.*]] = mul i64 1, [[TMP21]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT5:%.*]] = insertelement poison, i64 [[TMP39]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT6:%.*]] = shufflevector [[BROADCAST_SPLATINSERT5]], poison, zeroinitializer +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI8:%.*]] = phi [ [[TMP22]], %[[VEC_EPILOG_PH]] ], [ [[TMP35:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], [[VEC_IND]], i32 0, i64 3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i8.nxv2p0( [[TMP28]], i32 1, splat (i1 true), poison) +; CHECK-NEXT: [[TMP29:%.*]] = zext [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP30:%.*]] = call @llvm.umin.nxv2i32( [[TMP23]], [[TMP29]]) +; CHECK-NEXT: [[TMP31:%.*]] = call @llvm.umin.nxv2i32( [[TMP24]], [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]] +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[TMP32]], i32 0 +; CHECK-NEXT: store zeroinitializer, ptr [[TMP33]], align 1 +; CHECK-NEXT: [[TMP34:%.*]] = zext [[TMP31]] to +; CHECK-NEXT: [[TMP35]] = or [[VEC_PHI8]], [[TMP34]] +; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], [[TMP21]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[BROADCAST_SPLAT6]] +; CHECK-NEXT: [[TMP36:%.*]] = icmp eq i64 [[INDEX_NEXT9]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP36]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP37:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64( [[TMP35]]) +; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL10:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 16, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX11:%.*]] = phi i64 [ [[TMP37]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP13]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL10]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX11]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3 +; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1 +; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 +; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]]) +; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) +; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]]) +; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1 +; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64 +; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 17 +; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3 + %l = load i8, ptr %gep.src.i.i, align 1 + %l.ext = zext i8 %l to i32 + %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false) + %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext) + %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false) + %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0) + %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv + store i8 0, ptr %gep.dst, align 1 + %min.ext = zext i32 %min.1 to i64 + %red.next = or i64 %red, %min.ext + %iv.next = add i64 %iv, 1 + %exitcond.not.i.i = icmp eq i64 %iv.next, 17 + br i1 %exitcond.not.i.i, label %exit, label %loop + +exit: + ret i64 %red.next +} + +; Test case for https://github.com/llvm/llvm-project/issues/149726. +define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks(ptr noalias %A, ptr noalias %B, ptr noalias %C, ptr noalias %D, ptr noalias %E, ptr noalias %F, ptr noalias %G, ptr noalias %H, ptr noalias %I, ptr noalias %J, ptr noalias %K, ptr %L) #1 { +; CHECK-LABEL: define void @main_vector_loop_fixed_single_vector_iteration_with_runtime_checks( +; CHECK-SAME: ptr noalias [[A:%.*]], ptr noalias [[B:%.*]], ptr noalias [[C:%.*]], ptr noalias [[D:%.*]], ptr noalias [[E:%.*]], ptr noalias [[F:%.*]], ptr noalias [[G:%.*]], ptr noalias [[H:%.*]], ptr noalias [[I:%.*]], ptr noalias [[J:%.*]], ptr noalias [[K:%.*]], ptr [[L:%.*]]) #[[ATTR1:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: br i1 true, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[J]], i64 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP0]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <8 x i64> [[STRIDED_VEC]] to <8 x i16> +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[K]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i16, ptr [[K]], i64 2 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[K]], i64 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i16, ptr [[K]], i64 6 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[K]], i64 8 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[K]], i64 10 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[K]], i64 12 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i16, ptr [[K]], i64 14 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; CHECK-NEXT: store i16 [[TMP14]], ptr [[TMP6]], align 2 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; CHECK-NEXT: store i16 [[TMP15]], ptr [[TMP7]], align 2 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; CHECK-NEXT: store i16 [[TMP16]], ptr [[TMP8]], align 2 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; CHECK-NEXT: store i16 [[TMP17]], ptr [[TMP9]], align 2 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; CHECK-NEXT: store i16 [[TMP18]], ptr [[TMP10]], align 2 +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; CHECK-NEXT: store i16 [[TMP19]], ptr [[TMP11]], align 2 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; CHECK-NEXT: store i16 [[TMP20]], ptr [[TMP12]], align 2 +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; CHECK-NEXT: store i16 [[TMP21]], ptr [[TMP13]], align 2 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[B]], align 8 +; CHECK-NEXT: store i64 0, ptr [[C]], align 8 +; CHECK-NEXT: store i64 0, ptr [[D]], align 8 +; CHECK-NEXT: store i64 0, ptr [[E]], align 8 +; CHECK-NEXT: store i64 0, ptr [[F]], align 8 +; CHECK-NEXT: store i64 0, ptr [[G]], align 8 +; CHECK-NEXT: store i64 0, ptr [[H]], align 8 +; CHECK-NEXT: store i64 0, ptr [[I]], align 8 +; CHECK-NEXT: store i64 0, ptr [[L]], align 8 +; CHECK-NEXT: br label %[[MIDDLE_BLOCK:.*]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_J:%.*]] = getelementptr i64, ptr [[J]], i64 [[IV]] +; CHECK-NEXT: [[L_J:%.*]] = load i64, ptr [[GEP_J]], align 8 +; CHECK-NEXT: [[L_TRUNC:%.*]] = trunc i64 [[L_J]] to i16 +; CHECK-NEXT: [[GEP_K:%.*]] = getelementptr i16, ptr [[K]], i64 [[IV]] +; CHECK-NEXT: store i16 [[L_TRUNC]], ptr [[GEP_K]], align 2 +; CHECK-NEXT: store i64 0, ptr [[A]], align 8 +; CHECK-NEXT: store i64 0, ptr [[B]], align 8 +; CHECK-NEXT: store i64 0, ptr [[C]], align 8 +; CHECK-NEXT: store i64 0, ptr [[D]], align 8 +; CHECK-NEXT: store i64 0, ptr [[E]], align 8 +; CHECK-NEXT: store i64 0, ptr [[F]], align 8 +; CHECK-NEXT: store i64 0, ptr [[G]], align 8 +; CHECK-NEXT: store i64 0, ptr [[H]], align 8 +; CHECK-NEXT: store i64 0, ptr [[I]], align 8 +; CHECK-NEXT: store i64 0, ptr [[L]], align 8 +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 2 +; CHECK-NEXT: [[EC:%.*]] = icmp ult i64 [[IV]], 14 +; CHECK-NEXT: br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %gep.J = getelementptr i64, ptr %J, i64 %iv + %l.J = load i64, ptr %gep.J, align 8 + %l.trunc = trunc i64 %l.J to i16 + %gep.K = getelementptr i16, ptr %K, i64 %iv + store i16 %l.trunc, ptr %gep.K, align 2 + store i64 0, ptr %A, align 8 + store i64 0, ptr %B, align 8 + store i64 0, ptr %C, align 8 + store i64 0, ptr %D, align 8 + store i64 0, ptr %E, align 8 + store i64 0, ptr %F, align 8 + store i64 0, ptr %G, align 8 + store i64 0, ptr %H, align 8 + store i64 0, ptr %I, align 8 + store i64 0, ptr %L, align 8 + %iv.next = add i64 %iv, 2 + %ec = icmp ult i64 %iv, 14 + br i1 %ec, label %loop, label %exit, !llvm.loop !0 + +exit: + ret void +} + +declare i32 @llvm.umin.i32(i32, i32) + +declare i32 @llvm.abs.i32(i32, i1 immarg) + +attributes #0 = { "target-cpu"="neoverse-512tvb" } +attributes #1 = { "target-cpu"="grace" } + +!0 = distinct !{!0, !1, !2} +!1 = !{!"llvm.loop.mustprogress"} +!2 = !{!"llvm.loop.vectorize.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index 400b031917a72..7090ae83be868 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -7,11 +7,7 @@ target triple = "aarch64-none-unknown-elf" define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-LABEL: define i32 @dotp( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 -; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -33,64 +29,8 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP13:%.*]] = mul nuw i64 [[TMP12]], 4 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP13]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP15]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 4 -; CHECK-NEXT: [[TMP18:%.*]] = insertelement zeroinitializer, i32 [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi [ [[TMP18]], [[VEC_EPILOG_PH]] ], [ [[TMP27:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX2]] -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP20]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP21]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = zext [[WIDE_LOAD4]] to -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX2]] -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP23]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP24]], align 1 -; CHECK-NEXT: [[TMP25:%.*]] = zext [[WIDE_LOAD5]] to -; CHECK-NEXT: [[TMP26:%.*]] = mul [[TMP25]], [[TMP22]] -; CHECK-NEXT: [[TMP27]] = add [[TMP26]], [[VEC_PHI3]] -; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX2]], [[TMP17]] -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP27]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT]], label [[VEC_EPILOG_SCALAR_PH]] -; CHECK: vec.epilog.scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 1024, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] -; CHECK-NEXT: [[BC_MERGE_RDX7:%.*]] = phi i32 [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[ACCUM:%.*]] = phi i32 [ [[BC_MERGE_RDX7]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[ADD:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr i8, ptr [[A]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_A:%.*]] = load i8, ptr [[GEP_A]], align 1 -; CHECK-NEXT: [[EXT_A:%.*]] = zext i8 [[LOAD_A]] to i32 -; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr i8, ptr [[B]], i64 [[IV]] -; CHECK-NEXT: [[LOAD_B:%.*]] = load i8, ptr [[GEP_B]], align 1 -; CHECK-NEXT: [[EXT_B:%.*]] = zext i8 [[LOAD_B]] to i32 -; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[EXT_B]], [[EXT_A]] -; CHECK-NEXT: [[ADD]] = add i32 [[MUL]], [[ACCUM]] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: for.exit: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP29]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i32 [[ADD_LCSSA]] +; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_PH]] +; CHECK: scalar.ph: ; entry: br label %for.body @@ -142,7 +82,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.experimental.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[IV_NEXT]] -; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[IV_NEXT]] @@ -174,7 +114,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[TMP13]] = add <4 x i32> [[TMP14]], [[VEC_PHI9]] ; CHECK-NEXT: [[INDEX_NEXT14]] = add nuw i64 [[INDEX9]], 4 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT14]], [[N_VEC5]] -; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP13]]) ; CHECK-NEXT: [[CMP_N15:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC5]] @@ -198,7 +138,7 @@ define void @dotp_small_epilogue_vf(i64 %idx.neg, i8 %a) #1 { ; CHECK-NEXT: [[CMP_IV_NEG:%.*]] = icmp ugt i64 [[IV_NEG]], 0 ; CHECK-NEXT: [[CMP_IV:%.*]] = icmp ne i64 [[ACCUM1]], -1 ; CHECK-NEXT: [[EXITCOND:%.*]] = and i1 [[CMP_IV_NEG]], [[CMP_IV]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[WHILE_BODY1]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: while.end.loopexit: ; CHECK-NEXT: [[RESULT:%.*]] = phi i32 [ [[ADD]], [[WHILE_BODY1]] ], [ [[TMP6]], [[MIDDLE_BLOCK]] ], [ [[TMP15]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret void @@ -557,7 +497,7 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) { ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 16 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], splat (i64 16) ; CHECK-NEXT: [[TMP181:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP181]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[TMP182:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) ; CHECK-NEXT: br label [[EXIT:%.*]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll deleted file mode 100644 index d85bc484af0b0..0000000000000 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-no-remaining-iterations.ll +++ /dev/null @@ -1,146 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 5 -; RUN: opt -passes=loop-vectorize -S %s | FileCheck %s - -target triple = "aarch64-linux-gnu" - -define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 { -; CHECK-LABEL: define i64 @main_vector_loop_fixed_with_no_remaining_iterations( -; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[ITER_CHECK:.*]]: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP3]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] -; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: -; CHECK-NEXT: br i1 true, label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP0:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false) -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP17:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[INDEX]], i32 0, i64 3 -; CHECK-NEXT: [[WIDE_VEC2:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1 -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <64 x i8> [[WIDE_VEC2]], <64 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[STRIDED_VEC3]] to <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP0]], <16 x i32> [[TMP6]]) -; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP1]], <16 x i32> [[TMP8]]) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP12]], align 1 -; CHECK-NEXT: [[TMP15:%.*]] = zext <16 x i32> [[TMP10]] to <16 x i64> -; CHECK-NEXT: [[TMP17]] = or <16 x i64> [[VEC_PHI1]], [[TMP15]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: br i1 true, label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.or.v16i64(<16 x i64> [[TMP17]]) -; CHECK-NEXT: br label %[[VEC_EPILOG_ITER_CHECK:.*]] -; CHECK: [[VEC_EPILOG_ITER_CHECK]]: -; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 2 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ule i64 16, [[TMP14]] -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]] -; CHECK: [[VEC_EPILOG_PH]]: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = mul nuw i64 [[TMP31]], 2 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 16, [[TMP16]] -; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 -; CHECK-NEXT: [[TMP36:%.*]] = select i1 [[TMP32]], i64 [[TMP16]], i64 [[N_MOD_VF]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 16, [[TMP36]] -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP20:%.*]] = mul nuw i64 [[TMP19]], 2 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i32 [[X]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer -; CHECK-NEXT: [[TMP21:%.*]] = insertelement zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0 -; CHECK-NEXT: [[TMP22:%.*]] = call @llvm.abs.nxv2i32( [[BROADCAST_SPLAT2]], i1 false) -; CHECK-NEXT: [[TMP23:%.*]] = call @llvm.abs.nxv2i32( [[BROADCAST_SPLAT2]], i1 false) -; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.stepvector.nxv2i64() -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[VEC_EPILOG_RESUME_VAL]], i64 0 -; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer -; CHECK-NEXT: [[TMP25:%.*]] = mul [[TMP24]], splat (i64 1) -; CHECK-NEXT: [[INDUCTION:%.*]] = add [[DOTSPLAT]], [[TMP25]] -; CHECK-NEXT: [[TMP37:%.*]] = mul i64 1, [[TMP20]] -; CHECK-NEXT: [[DOTSPLATINSERT4:%.*]] = insertelement poison, i64 [[TMP37]], i64 0 -; CHECK-NEXT: [[DOTSPLAT5:%.*]] = shufflevector [[DOTSPLATINSERT4]], poison, zeroinitializer -; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT10:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], %[[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi [ [[TMP21]], %[[VEC_EPILOG_PH]] ], [ [[TMP34:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP38:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], [[VEC_IND]], i32 0, i64 3 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv2i8.nxv2p0( [[TMP38]], i32 1, splat (i1 true), poison) -; CHECK-NEXT: [[TMP28:%.*]] = zext [[WIDE_MASKED_GATHER]] to -; CHECK-NEXT: [[TMP29:%.*]] = call @llvm.umin.nxv2i32( [[TMP22]], [[TMP28]]) -; CHECK-NEXT: [[TMP39:%.*]] = call @llvm.umin.nxv2i32( [[TMP23]], [[TMP29]]) -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX6]] -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP26]], i32 0 -; CHECK-NEXT: store zeroinitializer, ptr [[TMP27]], align 1 -; CHECK-NEXT: [[TMP33:%.*]] = zext [[TMP39]] to -; CHECK-NEXT: [[TMP34]] = or [[VEC_PHI6]], [[TMP33]] -; CHECK-NEXT: [[INDEX_NEXT10]] = add nuw i64 [[INDEX6]], [[TMP20]] -; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT5]] -; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT10]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP35]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] -; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vector.reduce.or.nxv2i64( [[TMP34]]) -; CHECK-NEXT: br label %[[VEC_EPILOG_SCALAR_PH]] -; CHECK: [[VEC_EPILOG_SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL8:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: [[BC_MERGE_RDX9:%.*]] = phi i64 [ [[TMP30]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP18]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[ITER_CHECK]] ] -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL8]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX9]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC_I_I:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], i64 [[IV]], i32 0, i64 3 -; CHECK-NEXT: [[L:%.*]] = load i8, ptr [[GEP_SRC_I_I]], align 1 -; CHECK-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32 -; CHECK-NEXT: [[ABS_0:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) -; CHECK-NEXT: [[MIN_0:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_0]], i32 [[L_EXT]]) -; CHECK-NEXT: [[ABS_1:%.*]] = call i32 @llvm.abs.i32(i32 [[X]], i1 false) -; CHECK-NEXT: [[MIN_1:%.*]] = call i32 @llvm.umin.i32(i32 [[ABS_1]], i32 [[MIN_0]]) -; CHECK-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[IV]] -; CHECK-NEXT: store i8 0, ptr [[GEP_DST]], align 1 -; CHECK-NEXT: [[MIN_EXT:%.*]] = zext i32 [[MIN_1]] to i64 -; CHECK-NEXT: [[RED_NEXT]] = or i64 [[RED]], [[MIN_EXT]] -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT_I_I:%.*]] = icmp eq i64 [[IV_NEXT]], 16 -; CHECK-NEXT: br i1 [[EXITCOND_NOT_I_I]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] -; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] -; -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] - %gep.src.i.i = getelementptr { [4 x i8] }, ptr %src, i64 %iv, i32 0, i64 3 - %l = load i8, ptr %gep.src.i.i, align 1 - %l.ext = zext i8 %l to i32 - %abs.0 = call i32 @llvm.abs.i32(i32 %x, i1 false) - %min.0 = call i32 @llvm.umin.i32(i32 %abs.0, i32 %l.ext) - %abs.1 = call i32 @llvm.abs.i32(i32 %x, i1 false) - %min.1 = call i32 @llvm.umin.i32(i32 %abs.1, i32 %min.0) - %gep.dst = getelementptr inbounds i8, ptr %dst, i64 %iv - store i8 0, ptr %gep.dst, align 1 - %min.ext = zext i32 %min.1 to i64 - %red.next = or i64 %red, %min.ext - %iv.next = add i64 %iv, 1 - %exitcond.not.i.i = icmp eq i64 %iv.next, 16 - br i1 %exitcond.not.i.i, label %exit, label %loop - -exit: - ret i64 %red.next -} - -declare i32 @llvm.umin.i32(i32, i32) - -declare i32 @llvm.abs.i32(i32, i1 immarg) - -attributes #0 = { "target-cpu"="neoverse-512tvb" }