diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 1c53c606b271b..5ab7502429e95 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8072,6 +8072,19 @@ bool VPRecipeBuilder::getScaledReductions( if (Op == PHI) std::swap(Op, PhiOp); + using namespace llvm::PatternMatch; + // If Op is an extend, then it's still a valid partial reduction if the + // extended mul fulfills the other requirements. + // For example, reduce.add(ext(mul(ext(A), ext(B)))) is still a valid partial + // reduction since the inner extends will be widened. We already have oneUse + // checks on the inner extends so widening them is safe. + std::optional OuterExtKind = std::nullopt; + if (match(Op, m_ZExtOrSExt(m_Mul(m_Value(), m_Value())))) { + auto *Cast = cast(Op); + OuterExtKind = TTI::getPartialReductionExtendKind(Cast->getOpcode()); + Op = Cast->getOperand(0); + } + // Try and get a scaled reduction from the first non-phi operand. // If one is found, we use the discovered reduction instruction in // place of the accumulator for costing. @@ -8088,8 +8101,6 @@ bool VPRecipeBuilder::getScaledReductions( if (PhiOp != PHI) return false; - using namespace llvm::PatternMatch; - // If the update is a binary operator, check both of its operands to see if // they are extends. Otherwise, see if the update comes directly from an // extend. @@ -8099,7 +8110,7 @@ bool VPRecipeBuilder::getScaledReductions( Type *ExtOpTypes[2] = {nullptr}; TTI::PartialReductionExtendKind ExtKinds[2] = {TTI::PR_None}; - auto CollectExtInfo = [this, &Exts, &ExtOpTypes, + auto CollectExtInfo = [this, OuterExtKind, &Exts, &ExtOpTypes, &ExtKinds](SmallVectorImpl &Ops) -> bool { for (const auto &[I, OpI] : enumerate(Ops)) { const APInt *C; @@ -8120,6 +8131,10 @@ bool VPRecipeBuilder::getScaledReductions( ExtOpTypes[I] = ExtOp->getType(); ExtKinds[I] = TTI::getPartialReductionExtendKind(Exts[I]); + // The outer extend kind must be the same as the inner extends, so that + // they can be folded together. + if (OuterExtKind.has_value() && OuterExtKind.value() != ExtKinds[I]) + return false; } return true; }; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll index b430efc9e5283..b033f6051f812 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-constant-ops.ll @@ -467,83 +467,3 @@ loop: exit: ret i32 %red.next } - -define i64 @partial_reduction_mul_two_users(i64 %n, ptr %a, i16 %b, i32 %c) { -; CHECK-LABEL: define i64 @partial_reduction_mul_two_users( -; CHECK-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i16 [[B:%.*]], i32 [[C:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] -; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[B]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT]] to <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], [[TMP1]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], %[[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 -; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = zext <8 x i32> [[TMP2]] to <8 x i64> -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i64> @llvm.vector.partial.reduce.add.v4i64.v8i64(<4 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]]) -; CHECK-NEXT: [[TMP5:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = sext <8 x i32> [[TMP5]] to <8 x i64> -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[PARTIAL_REDUCE]]) -; CHECK-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP6]], i32 7 -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i64 [ [[VECTOR_RECUR_EXTRACT]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP8]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] -; CHECK-NEXT: br label %[[LOOP:.*]] -; CHECK: [[LOOP]]: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RES1:%.*]] = phi i64 [ [[SCALAR_RECUR_INIT]], %[[SCALAR_PH]] ], [ [[LOAD_EXT_EXT:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[RES2:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[ADD:%.*]], %[[LOOP]] ] -; CHECK-NEXT: [[LOAD:%.*]] = load i16, ptr [[A]], align 2 -; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 -; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[B]] to i32 -; CHECK-NEXT: [[MUL:%.*]] = mul i32 [[CONV]], [[CONV]] -; CHECK-NEXT: [[MUL_EXT:%.*]] = zext i32 [[MUL]] to i64 -; CHECK-NEXT: [[ADD]] = add i64 [[RES2]], [[MUL_EXT]] -; CHECK-NEXT: [[OR:%.*]] = or i32 [[MUL]], [[C]] -; CHECK-NEXT: [[LOAD_EXT:%.*]] = sext i16 [[LOAD]] to i32 -; CHECK-NEXT: [[LOAD_EXT_EXT]] = sext i32 [[LOAD_EXT]] to i64 -; CHECK-NEXT: [[EXITCOND740_NOT:%.*]] = icmp eq i64 [[IV]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND740_NOT]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] -; CHECK: [[EXIT]]: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i64 [ [[ADD]], %[[LOOP]] ], [ [[TMP8]], %[[MIDDLE_BLOCK]] ] -; CHECK-NEXT: ret i64 [[ADD_LCSSA]] -; -entry: - br label %loop - -loop: - %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] - %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ] - %res2 = phi i64 [ 0, %entry ], [ %add, %loop ] - %load = load i16, ptr %a, align 2 - %iv.next = add i64 %iv, 1 - %conv = sext i16 %b to i32 - %mul = mul i32 %conv, %conv - %mul.ext = zext i32 %mul to i64 - %add = add i64 %res2, %mul.ext - %second_use = or i32 %mul, %c ; this value is otherwise unused, but that's sufficient for the test - %load.ext = sext i16 %load to i32 - %load.ext.ext = sext i32 %load.ext to i64 - %exitcond740.not = icmp eq i64 %iv, %n - br i1 %exitcond740.not, label %exit, label %loop - -exit: - ret i64 %add -} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll index 6ead2a4eecbe8..c6c5c5105d540 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-neon.ll @@ -1889,6 +1889,515 @@ for.exit: ; preds = %for.body ret i32 %result } +define i32 @dotp_ext_mul(i64 %n, ptr %a, i8 %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_ext_mul( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP5]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; +; CHECK-INTERLEAVED-LABEL: define i32 @dotp_ext_mul( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT3]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP3]]) +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT4]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE2]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP5]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; +; CHECK-MAXBW-LABEL: define i32 @dotp_ext_mul( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i32> +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP2]], [[TMP2]] +; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP3]]) +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-MAXBW-NEXT: [[TMP5:%.*]] = sext <8 x i32> [[TMP4]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[PARTIAL_REDUCE]]) +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP5]], i32 7 +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ] + %res2 = phi i32 [ 0, %entry ], [ %add, %loop ] + %load = load i16, ptr %a, align 2 + %iv.next = add i64 %iv, 1 + %conv = zext i8 %b to i16 + %mul = mul i16 %conv, %conv + %mul.ext = zext i16 %mul to i32 + %add = add i32 %res2, %mul.ext + %load.ext = sext i16 %load to i32 + %load.ext.ext = sext i32 %load.ext to i64 + %ec = icmp eq i64 %iv, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %add +} + +define i64 @not_dotp_ext_mul_8to64(i64 %n, ptr %a, i8 %b) { +; CHECK-INTERLEAVE1-LABEL: define i64 @not_dotp_ext_mul_8to64( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; +; CHECK-INTERLEAVED-LABEL: define i64 @not_dotp_ext_mul_8to64( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16> +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT2]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i64> [[VEC_PHI1]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP7]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i64> [[TMP6]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP8]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; +; CHECK-MAXBW-LABEL: define i64 @not_dotp_ext_mul_8to64( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i64> +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[TMP5]] = add <8 x i64> [[VEC_PHI]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ] + %res2 = phi i64 [ 0, %entry ], [ %add, %loop ] + %load = load i16, ptr %a, align 2 + %iv.next = add i64 %iv, 1 + %conv = zext i8 %b to i16 + %mul = mul i16 %conv, %conv + %mul.ext = zext i16 %mul to i64 + %add = add i64 %res2, %mul.ext + %load.ext = sext i16 %load to i32 + %load.ext.ext = sext i32 %load.ext to i64 + %ec = icmp eq i64 %iv, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %add +} + +define i32 @not_dotp_sext_mul_zext(i64 %n, ptr %a, i8 %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_sext_mul_zext( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_sext_mul_zext( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16> +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT2]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i32> [[VEC_PHI1]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP7]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP6]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP8]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_sext_mul_zext( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = zext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = sext <8 x i16> [[TMP2]] to <8 x i32> +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ] + %res2 = phi i32 [ 0, %entry ], [ %add, %loop ] + %load = load i16, ptr %a, align 2 + %iv.next = add i64 %iv, 1 + %conv = zext i8 %b to i16 + %mul = mul i16 %conv, %conv + %mul.ext = sext i16 %mul to i32 + %add = add i32 %res2, %mul.ext + %load.ext = sext i16 %load to i32 + %load.ext.ext = sext i32 %load.ext to i64 + %ec = icmp eq i64 %iv, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %add +} + +define i32 @not_dotp_zext_mul_sext(i64 %n, ptr %a, i8 %b) { +; CHECK-INTERLEAVE1-LABEL: define i32 @not_dotp_zext_mul_sext( +; CHECK-INTERLEAVE1-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVE1-NEXT: entry: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16> +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVE1: vector.body: +; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVE1: middle.block: +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVE1: scalar.ph: +; +; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_zext_mul_sext( +; CHECK-INTERLEAVED-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-INTERLEAVED-NEXT: entry: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16> +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-INTERLEAVED: vector.body: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT2]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <8 x i32> [[VEC_PHI1]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT3]] to <8 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = sext <8 x i32> [[TMP7]] to <8 x i64> +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-INTERLEAVED: middle.block: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP6]], [[TMP5]] +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP8]], i32 7 +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-INTERLEAVED: scalar.ph: +; +; CHECK-MAXBW-LABEL: define i32 @not_dotp_zext_mul_sext( +; CHECK-MAXBW-SAME: i64 [[N:%.*]], ptr [[A:%.*]], i8 [[B:%.*]]) #[[ATTR0]] { +; CHECK-MAXBW-NEXT: entry: +; CHECK-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[N]], 1 +; CHECK-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-MAXBW: vector.ph: +; CHECK-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 8 +; CHECK-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]] +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i8> poison, i8 [[B]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i8> [[BROADCAST_SPLATINSERT]], <8 x i8> poison, <8 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[TMP1:%.*]] = sext <8 x i8> [[BROADCAST_SPLAT]] to <8 x i16> +; CHECK-MAXBW-NEXT: [[TMP2:%.*]] = mul <8 x i16> [[TMP1]], [[TMP1]] +; CHECK-MAXBW-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[TMP2]] to <8 x i32> +; CHECK-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-MAXBW: vector.body: +; CHECK-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-MAXBW-NEXT: [[TMP4:%.*]] = load i16, ptr [[A]], align 2 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <8 x i16> poison, i16 [[TMP4]], i64 0 +; CHECK-MAXBW-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT1]], <8 x i16> poison, <8 x i32> zeroinitializer +; CHECK-MAXBW-NEXT: [[TMP5]] = add <8 x i32> [[VEC_PHI]], [[TMP3]] +; CHECK-MAXBW-NEXT: [[TMP6:%.*]] = sext <8 x i16> [[BROADCAST_SPLAT2]] to <8 x i32> +; CHECK-MAXBW-NEXT: [[TMP7:%.*]] = sext <8 x i32> [[TMP6]] to <8 x i64> +; CHECK-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-MAXBW-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-MAXBW: middle.block: +; CHECK-MAXBW-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP5]]) +; CHECK-MAXBW-NEXT: [[VECTOR_RECUR_EXTRACT:%.*]] = extractelement <8 x i64> [[TMP7]], i32 7 +; CHECK-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-MAXBW: scalar.ph: +; +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ] + %res2 = phi i32 [ 0, %entry ], [ %add, %loop ] + %load = load i16, ptr %a, align 2 + %iv.next = add i64 %iv, 1 + %conv = sext i8 %b to i16 + %mul = mul i16 %conv, %conv + %mul.ext = zext i16 %mul to i32 + %add = add i32 %res2, %mul.ext + %load.ext = sext i16 %load to i32 + %load.ext.ext = sext i32 %load.ext to i64 + %ec = icmp eq i64 %iv, %n + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %add +} + !7 = distinct !{!7, !8, !9, !10} !8 = !{!"llvm.loop.mustprogress"} !9 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll index 33198e729e2cb..6df3f1b418eb6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vplan-printing.ll @@ -192,6 +192,91 @@ exit: ret i32 %add } +define i32 @print_partial_reduction_ext_mul(ptr %a, ptr %b) "target-features"="+neon,+dotprod" { +; CHECK: VPlan 'Initial VPlan for VF={8,16},UF>=1' { +; CHECK-NEXT: Live-in vp<[[VF:%.]]> = VF +; CHECK-NEXT: Live-in vp<[[VFxUF:%.]]> = VF * UF +; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count +; CHECK-NEXT: Live-in ir<1024> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<[[RDX_START:%.+]]> = reduction-start-vector ir<0>, ir<0>, ir<4> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<[[ACC:%.+]]> = phi vp<[[RDX_START]]>, vp<[[REDUCE:%.+]]> (VF scaled by 1/4) +; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>, vp<[[VF]]> +; CHECK-NEXT: CLONE ir<%gep.a> = getelementptr ir<%a>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_A:%.+]]> = vector-pointer ir<%gep.a> +; CHECK-NEXT: WIDEN ir<%load.a> = load vp<[[PTR_A]]> +; CHECK-NEXT: CLONE ir<%gep.b> = getelementptr ir<%b>, vp<[[STEPS]]> +; CHECK-NEXT: vp<[[PTR_B:%.+]]> = vector-pointer ir<%gep.b> +; CHECK-NEXT: WIDEN ir<%load.b> = load vp<[[PTR_B]]> +; CHECK-NEXT: EXPRESSION vp<[[REDUCE]]> = ir<[[ACC]]> + partial.reduce.add (mul (ir<%load.b> zext to i32), (ir<%load.a> zext to i32)) +; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = add nuw vp<[[CAN_IV]]>, vp<[[VFxUF]]> +; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<[[RED_RESULT:%.+]]> = compute-reduction-result ir<[[ACC]]>, vp<[[REDUCE]]> +; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1024>, vp<[[VEC_TC]]> +; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %add.lcssa = phi i32 [ %add, %for.body ] (extra operand: vp<[[RED_RESULT]]> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<[[VEC_TC]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<[[RED_RESULT]]>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] +; CHECK-NEXT: IR %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %gep.a = getelementptr i8, ptr %a, i64 %iv +; CHECK-NEXT: IR %load.a = load i8, ptr %gep.a, align 1 +; CHECK-NEXT: IR %ext.a = zext i8 %load.a to i16 +; CHECK-NEXT: IR %gep.b = getelementptr i8, ptr %b, i64 %iv +; CHECK-NEXT: IR %load.b = load i8, ptr %gep.b, align 1 +; CHECK-NEXT: IR %ext.b = zext i8 %load.b to i16 +; CHECK-NEXT: IR %mul = mul i16 %ext.b, %ext.a +; CHECK-NEXT: IR %mul.ext = zext i16 %mul to i32 +; CHECK-NEXT: IR %add = add i32 %mul.ext, %accum +; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 +; CHECK-NEXT: IR %exitcond.not = icmp eq i64 %iv.next, 1024 +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i16 + %gep.b = getelementptr i8, ptr %b, i64 %iv + %load.b = load i8, ptr %gep.b, align 1 + %ext.b = zext i8 %load.b to i16 + %mul = mul i16 %ext.b, %ext.a + %mul.ext = zext i16 %mul to i32 + %add = add i32 %mul.ext, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %exit, label %for.body + +exit: + ret i32 %add +} !0 = distinct !{!0, !2, !3} !1 = distinct !{!1, !2, !4} diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 1668fd49a22a3..85f1f48aab4d3 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -1080,3 +1080,93 @@ exit: %red.next.lcssa = phi i64 [ %red.next, %loop ] ret i64 %red.next.lcssa } + +; This reduce.add(ext(mul(ext(A), ext(B)))) can't be turned into an +; ExtMulAccReduction VPExpressionRecipe since the mul has two users. +; It can however be turned into an ExtendedReduction since that one doesn't +; modify the mul's operands. +define i64 @print_ext_mul_two_uses(i64 %n, ptr %a, i16 %b, i32 %c) { +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF * UF +; CHECK-NEXT: Live-in vp<%1> = vector-trip-count +; CHECK-NEXT: vp<%2> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: EMIT vp<%2> = EXPAND SCEV (1 + %n) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%3> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: WIDEN-CAST ir<%conv> = sext ir<%b> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%conv>, ir<%conv> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%4> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%res2> = phi vp<%3>, vp<%5> +; CHECK-NEXT: CLONE ir<%load> = load ir<%a> +; CHECK-NEXT: WIDEN-CAST ir<%load.ext> = sext ir<%load> to i32 +; CHECK-NEXT: WIDEN-CAST ir<%load.ext.ext> = sext ir<%load.ext> to i64 +; CHECK-NEXT: EXPRESSION vp<%5> = ir<%res2> + reduce.add (ir<%mul> zext to i64) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%4>, vp<%0> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%1> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%7> = compute-reduction-result ir<%res2>, vp<%5> +; CHECK-NEXT: EMIT vp<%vector.recur.extract> = extract-last-element ir<%load.ext.ext> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%2>, vp<%1> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %add.lcssa = phi i64 [ %add, %loop ] (extra operand: vp<%7> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%1>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%scalar.recur.init> = phi [ vp<%vector.recur.extract>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%7>, middle.block ], [ ir<0>, ir-bb ] +; CHECK-NEXT: Successor(s): ir-bb +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb: +; CHECK-NEXT: IR %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ] (extra operand: vp<%scalar.recur.init> from scalar.ph) +; CHECK-NEXT: IR %res2 = phi i64 [ 0, %entry ], [ %add, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %load = load i16, ptr %a, align 2 +; CHECK-NEXT: IR %iv.next = add i64 %iv, 1 +; CHECK-NEXT: IR %conv = sext i16 %b to i32 +; CHECK-NEXT: IR %mul = mul i32 %conv, %conv +; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 +; CHECK-NEXT: IR %add = add i64 %res2, %mul.ext +; CHECK-NEXT: IR %second_use = or i32 %mul, %c +; CHECK-NEXT: IR %load.ext = sext i16 %load to i32 +; CHECK-NEXT: IR %load.ext.ext = sext i32 %load.ext to i64 +; CHECK-NEXT: IR %exitcond740.not = icmp eq i64 %iv, %n +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %res1 = phi i64 [ 0, %entry ], [ %load.ext.ext, %loop ] + %res2 = phi i64 [ 0, %entry ], [ %add, %loop ] + %load = load i16, ptr %a, align 2 + %iv.next = add i64 %iv, 1 + %conv = sext i16 %b to i32 + %mul = mul i32 %conv, %conv + %mul.ext = zext i32 %mul to i64 + %add = add i64 %res2, %mul.ext + %second_use = or i32 %mul, %c ; this value is otherwise unused, but that's sufficient for the test + %load.ext = sext i16 %load to i32 + %load.ext.ext = sext i32 %load.ext to i64 + %exitcond740.not = icmp eq i64 %iv, %n + br i1 %exitcond740.not, label %exit, label %loop + +exit: + ret i64 %add +}