diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e62d57e6920b7..8856857e06964 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7910,6 +7910,29 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
         (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
       ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
   }
+
+  // Check that all partial reductions in a chain are only used by other partial
+  // reductions with the same scale factor. Otherwise we end up creating users
+  // of scaled reductions where the types of the other operands don't match.
+  auto AllUsersPartialRdx = [this](Instruction *I, unsigned Scale) {
+    return all_of(I->users(), [Scale, this](const User *U) {
+      auto *UI = cast<Instruction>(U);
+
+      if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) {
+        return all_of(UI->users(), [Scale, this](const User *U) {
+          auto *UI = cast<Instruction>(U);
+          return ScaledReductionMap.lookup_or(UI, 0) == Scale;
+        });
+      }
+
+      return ScaledReductionMap.lookup_or(UI, 0) == Scale ||
+             !OrigLoop->contains(UI->getParent());
+    });
+  };
+  for (const auto &[Chain, Scale] : PartialReductionChains) {
+    if (!AllUsersPartialRdx(Chain.Reduction, Scale))
+      ScaledReductionMap.erase(Chain.Reduction);
+  }
 }
 
 bool VPRecipeBuilder::getScaledReductions(
@@ -8093,11 +8116,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
   if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
     return tryToWidenMemory(Instr, Operands, Range);
 
-  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
-    if (auto PartialRed =
-            tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
-      return PartialRed;
-  }
+  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
+    return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
 
   if (!shouldWiden(Instr, Range))
     return nullptr;
@@ -8131,9 +8151,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
       isa<VPPartialReductionRecipe>(BinOpRecipe))
     std::swap(BinOp, Accumulator);
 
-  if (ScaleFactor !=
-      vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
-    return nullptr;
+  assert(ScaleFactor ==
+             vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) &&
+         "all accumulators in chain must have same scale factor");
 
   unsigned ReductionOpcode = Reduction->getOpcode();
   if (ReductionOpcode == Instruction::Sub) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 5ae08393a1804..3dfa6df3313a5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1361,132 +1361,6 @@ for.body: ; preds = %for.body.preheader,
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
 }
 
-define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
-; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEON-NEXT:  entry:
-; CHECK-NEON-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-NEON-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-NEON-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-NEON-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-NEON-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEON-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-NEON:       vector.ph:
-; CHECK-NEON-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
-; CHECK-NEON-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEON-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-NEON-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-NEON-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-NEON:       vector.body:
-; CHECK-NEON-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEON-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEON-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-NEON-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-NEON-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT:    [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-NEON-NEXT:    [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
-; CHECK-NEON-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEON-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEON-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-NEON:       middle.block:
-; CHECK-NEON-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
-; CHECK-NEON-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-NEON-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-NEON:       scalar.ph:
-;
-; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-SVE-NEXT:  entry:
-; CHECK-SVE-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-SVE-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-SVE-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-SVE-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-SVE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; CHECK-SVE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-SVE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-SVE:       vector.ph:
-; CHECK-SVE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; CHECK-SVE-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
-; CHECK-SVE-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-SVE-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-SVE-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-SVE-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-SVE-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-SVE:       vector.body:
-; CHECK-SVE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-SVE-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-SVE-NEXT:    [[TMP7:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT:    [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]]
-; CHECK-SVE-NEXT:    [[TMP9]] = add <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-SVE-NEXT:    [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-SVE-NEXT:    br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-SVE:       middle.block:
-; CHECK-SVE-NEXT:    [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
-; CHECK-SVE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-SVE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE:       scalar.ph:
-;
-; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-SVE-MAXBW-NEXT:  entry:
-; CHECK-SVE-MAXBW-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-SVE-MAXBW-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-SVE-MAXBW-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-SVE-MAXBW-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-SVE-MAXBW-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
-; CHECK-SVE-MAXBW-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-SVE-MAXBW-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-SVE-MAXBW:       vector.ph:
-; CHECK-SVE-MAXBW-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT:    [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
-; CHECK-SVE-MAXBW-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
-; CHECK-SVE-MAXBW-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-SVE-MAXBW-NEXT:    [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-SVE-MAXBW-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-SVE-MAXBW-NEXT:    br label [[VECTOR_BODY:%.*]]
-; CHECK-SVE-MAXBW:       vector.body:
-; CHECK-SVE-MAXBW-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT:    [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT:    [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-SVE-MAXBW-NEXT:    [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT:    [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
-; CHECK-SVE-MAXBW-NEXT:    [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-MAXBW-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-SVE-MAXBW-NEXT:    [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT:    br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-SVE-MAXBW:       middle.block:
-; CHECK-SVE-MAXBW-NEXT:    [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP8]])
-; CHECK-SVE-MAXBW-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE-MAXBW:       scalar.ph:
-;
-entry:
-  br label %loop
-
-loop:
-  %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
-  %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
-  %l = load i8, ptr %ptr.iv, align 1
-  %l.ext = zext i8 %l to i32
-  %add = add i32 %red, %l.ext
-  %red.next = add i32 %add, %offset
-  %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
-  %ec = icmp eq ptr %ptr.iv, %end
-  br i1 %ec, label %exit, label %loop
-
-exit:
-  ret i32 %red.next
-}
-
 attributes #0 = { vscale_range(1,16) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
new file mode 100644
index 0000000000000..5b5c40a83e11e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4
+; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
+
+target triple = "arm64-apple-macosx"
+
+define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
+; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
+; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEON-NEXT:  entry:
+; CHECK-NEON-NEXT:    [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEON-NEXT:    [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEON-NEXT:    [[TMP0:%.*]] = add i64 [[END1]], 1
+; CHECK-NEON-NEXT:    [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; CHECK-NEON-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
+; CHECK-NEON-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEON:       vector.ph:
+; CHECK-NEON-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
+; CHECK-NEON-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT:    [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
+; CHECK-NEON-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
+; CHECK-NEON-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEON:       vector.body:
+; CHECK-NEON-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT:    [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
+; CHECK-NEON-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-NEON-NEXT:    [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT:    [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-NEON-NEXT:    [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
+; CHECK-NEON-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT:    [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT:    br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEON:       middle.block:
+; CHECK-NEON-NEXT:    [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
+; CHECK-NEON-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEON-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEON:       scalar.ph:
+;
+entry:
+  br label %loop
+ +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %add = add i32 %red, %l.ext + %red.next = add i32 %add, %offset + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + + +define i16 @test_incomplete_chain_without_mul(ptr noalias %dst, ptr %A, ptr %B) #0 { +; CHECK-NEON-LABEL: define i16 @test_incomplete_chain_without_mul( +; CHECK-NEON-SAME: ptr noalias [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEON-NEXT: entry: +; CHECK-NEON-NEXT: br label [[VECTOR_MEMCHECK:%.*]] +; CHECK-NEON: vector.ph: +; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEON: vector.body: +; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i16> [ zeroinitializer, [[VECTOR_MEMCHECK]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEON-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1 +; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0 +; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEON-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i16> +; CHECK-NEON-NEXT: [[TMP2:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15 +; CHECK-NEON-NEXT: store i16 [[TMP2]], ptr [[DST]], align 2 +; CHECK-NEON-NEXT: [[TMP3:%.*]] = load i8, ptr [[B]], align 1 +; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[TMP3]], i64 0 +; CHECK-NEON-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEON-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT7]] to <16 x i16> +; CHECK-NEON-NEXT: [[TMP5:%.*]] = add <16 x i16> [[VEC_PHI]], [[TMP4]] +; CHECK-NEON-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP5]], [[TMP1]] +; CHECK-NEON-NEXT: [[TMP7]] = add <16 x i16> [[TMP6]], [[TMP4]] +; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-NEON-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEON-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEON: middle.block: +; CHECK-NEON-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP7]]) +; CHECK-NEON-NEXT: br label [[SCALAR_PH:%.*]] +; CHECK-NEON: scalar.ph: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %red = phi i16 [ 0, %entry ], [ %red.next, %loop ] + %l.a = load i8, ptr %A, align 1 + %a.ext = zext i8 %l.a to i16 + store i16 %a.ext, ptr %dst, align 2 + %l.b = load i8, ptr %B, align 1 + %b.ext = zext i8 %l.b to i16 + %add = add i16 %red, %b.ext + %add.1 = add i16 %add, %a.ext + %red.next = add i16 %add.1, %b.ext + %iv.next = add i64 %iv, 1 + %ec = icmp ult i64 %iv, 1024 + br i1 %ec, label %loop, label %exit + +exit: + ret i16 %red.next +} + +attributes #0 = { "target-cpu"="grace" }
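For context on the incomplete-chain case the patch handles: the first add in the loop is an extended-add reduction (its operand is zero-extended from i8, giving it a VF scale factor), but its result feeds a second, full-width add whose other operand is not part of any scaled-reduction chain. Emitting a partial reduction for the first add would narrow its accumulator and leave the second add with mismatched vector operand types, so collectScaledReductions now drops such chains up front and tryToCreatePartialReduction can assert instead of bailing out. A minimal sketch of the kind of source loop involved, as a hypothetical C++ equivalent of @red_extended_add_incomplete_chain (names illustrative; the IR test's pointer-based exit condition is simplified to an index bound):

#include <cstddef>

// Both adds must stay at full width: the CHECK-NEON lines above verify
// plain 'add <16 x i32>' instructions rather than a partial.reduce call.
int red_extended_add_incomplete_chain(const unsigned char *start,
                                      std::size_t n, int offset) {
  int red = 0;
  for (std::size_t i = 0; i < n; ++i) {
    // Extended add: candidate head of a scaled-reduction chain (i8 -> i32).
    int add = red + static_cast<int>(start[i]);
    // Full-width user that is not a partial reduction with the same scale
    // factor, making the chain "incomplete".
    red = add + offset;
  }
  return red;
}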