From 68a526b079f30603cd6fd7aa376e5b5117ff4a01 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 6 Nov 2025 11:11:41 +0000 Subject: [PATCH 1/4] [AArch64] Enable maximize vector bandwidth for Neoverse-V1, V2 and N1 This PR enables maximizing vector bandwidth for the Neoverse-V1, V2 and N1 CPUs. --- .../AArch64/AArch64TargetTransformInfo.cpp | 13 +- .../AArch64/conditional-branches-cost.ll | 102 ++- .../AArch64/fully-unrolled-cost.ll | 2 +- .../AArch64/interleave-with-gaps.ll | 4 +- .../AArch64/partial-reduce-chained.ll | 314 ++++---- .../partial-reduce-dot-product-epilogue.ll | 32 +- .../AArch64/partial-reduce-dot-product.ll | 703 ++++++++++-------- .../AArch64/partial-reduce-sub.ll | 94 ++- .../LoopVectorize/AArch64/partial-reduce.ll | 492 +++++++----- .../LoopVectorize/AArch64/store-costs-sve.ll | 134 ++-- .../LoopVectorize/AArch64/sve2-histcnt.ll | 34 +- ...row-interleave-to-widen-memory-scalable.ll | 141 +++- .../AArch64/type-shrinkage-zext-costs.ll | 52 +- .../AArch64/wider-VF-for-callinst.ll | 31 +- 14 files changed, 1270 insertions(+), 878 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 0bae00bafee3c..4a606f242db01 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -375,8 +375,17 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( TargetTransformInfo::RegisterKind K) const { assert(K != TargetTransformInfo::RGK_Scalar); - return (K == TargetTransformInfo::RGK_FixedWidthVector && - ST->isNeonAvailable()); + + if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable()) + return true; + + switch (ST->getProcFamily()) { + case AArch64Subtarget::NeoverseN2: + return false; + default: + return K == TargetTransformInfo::RGK_ScalableVector && + ST->isSVEorStreamingSVEAvailable(); + } } /// Calculate the cost of materializing a 64-bit value. 
This helper diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 2e3ddc8899ec7..4f4f07e68ccfd 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -522,26 +522,82 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-LABEL: define void @multiple_exit_conditions( ; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] { ; DEFAULT-NEXT: [[ENTRY:.*:]] -; DEFAULT-NEXT: br label %[[VECTOR_PH:.*]] +; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP0]], 1 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP6]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; DEFAULT: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: +; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 257, [[TMP3]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: -; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048 +; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16 +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP5]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]] ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] ; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2 -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], splat (i16 1) -; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double> -; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[NEXT_GEP]], align 8 -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; DEFAULT-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP1]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP8:%.*]] = or [[BROADCAST_SPLAT]], splat (i16 1) +; DEFAULT-NEXT: [[TMP9:%.*]] = uitofp [[TMP8]] to +; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 2 +; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP11]] +; DEFAULT-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 3 +; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP14]] +; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP17:%.*]] = 
mul nuw i64 [[TMP16]], 12 +; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP17]] +; DEFAULT-NEXT: store [[TMP9]], ptr [[NEXT_GEP]], align 8 +; DEFAULT-NEXT: store [[TMP9]], ptr [[TMP12]], align 8 +; DEFAULT-NEXT: store [[TMP9]], ptr [[TMP15]], align 8 +; DEFAULT-NEXT: store [[TMP9]], ptr [[TMP18]], align 8 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; DEFAULT-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: -; DEFAULT-NEXT: br label %[[SCALAR_PH:.*]] -; DEFAULT: [[SCALAR_PH]]: +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]] +; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; DEFAULT: [[VEC_EPILOG_ITER_CHECK]]: +; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[N_VEC]], 8 +; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]] +; DEFAULT-NEXT: [[IND_END11:%.*]] = mul i64 [[N_VEC]], 2 +; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP6]] +; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF24:![0-9]+]] +; DEFAULT: [[VEC_EPILOG_PH]]: +; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; DEFAULT-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 2 +; DEFAULT-NEXT: [[N_MOD_VF2:%.*]] = urem i64 257, [[TMP22]] +; DEFAULT-NEXT: [[N_VEC3:%.*]] = sub i64 257, [[N_MOD_VF2]] +; DEFAULT-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC3]], 8 +; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]] +; DEFAULT-NEXT: [[TMP25:%.*]] = mul i64 [[N_VEC3]], 2 +; DEFAULT-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; DEFAULT: [[VEC_EPILOG_VECTOR_BODY]]: +; DEFAULT-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; DEFAULT-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX4]], 8 +; DEFAULT-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX1]] +; DEFAULT-NEXT: [[TMP26:%.*]] = load i16, ptr [[SRC]], align 2 +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i16 [[TMP26]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP27:%.*]] = or [[BROADCAST_SPLAT7]], splat (i16 1) +; DEFAULT-NEXT: [[TMP28:%.*]] = uitofp [[TMP27]] to +; DEFAULT-NEXT: store [[TMP28]], ptr [[NEXT_GEP5]], align 8 +; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX4]], [[TMP22]] +; DEFAULT-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] +; DEFAULT-NEXT: br i1 [[TMP29]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; DEFAULT: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; DEFAULT-NEXT: [[CMP_N9:%.*]] = icmp eq i64 257, [[N_VEC3]] +; DEFAULT-NEXT: br i1 [[CMP_N9]], [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; DEFAULT: [[VEC_EPILOG_SCALAR_PH]]: ; ; PRED-LABEL: define void @multiple_exit_conditions( ; PRED-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] { @@ -549,28 +605,28 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: br label %[[VECTOR_PH:.*]] ; PRED: [[VECTOR_PH]]: ; PRED-NEXT: [[TMP0:%.*]] = 
call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 +; PRED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4 ; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 1 +; PRED-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 2 ; PRED-NEXT: [[TMP8:%.*]] = sub i64 257, [[TMP7]] ; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 257, [[TMP7]] ; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 257) +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 257) ; PRED-NEXT: br label %[[VECTOR_BODY:.*]] ; PRED: [[VECTOR_BODY]]: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ] ; PRED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; PRED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] ; PRED-NEXT: [[TMP12:%.*]] = load i16, ptr [[SRC]], align 2 -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP12]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PRED-NEXT: [[TMP13:%.*]] = or [[BROADCAST_SPLAT]], splat (i16 1) -; PRED-NEXT: [[TMP14:%.*]] = uitofp [[TMP13]] to -; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP14]], ptr align 8 [[NEXT_GEP]], [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP12]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[TMP11:%.*]] = or [[BROADCAST_SPLAT]], splat (i16 1) +; PRED-NEXT: [[TMP13:%.*]] = uitofp [[TMP11]] to +; PRED-NEXT: call void @llvm.masked.store.nxv4f64.p0( [[TMP13]], ptr align 8 [[NEXT_GEP]], [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]]) -; PRED-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP10]]) +; PRED-NEXT: [[TMP15:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP16:%.*]] = xor i1 [[TMP15]], true ; PRED-NEXT: br i1 [[TMP16]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; PRED: [[MIDDLE_BLOCK]]: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll index 1164778c19070..f645db16ed3c6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll @@ -50,7 +50,7 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 { ; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ] ; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next> ; CHECK: Cost for VF 16: 57 -; CHECK: LV: Selecting VF: vscale x 2 +; CHECK: LV: Selecting VF: 16 entry: br label 
%for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll index 0c91661d20ae7..5b4bb70e6a479 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll @@ -3,9 +3,9 @@ target triple = "aarch64-linux-gnu" -; Original loop has trip count 16, but contains interleave groups with gaps, so +; Original loop has trip count 17, but contains interleave groups with gaps, so ; the last iteration must execute in the scalar loop. Thus the vector loop can -; only execute up to 15 iterations. +; only execute up to 16 iterations. define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i32 %x) #0 { ; CHECK-LABEL: define i64 @vector_loop_with_remaining_iterations( ; CHECK-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]], i32 [[X:%.*]]) #[[ATTR0:[0-9]+]] { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll index 287226f14b753..dd0107b8c4bff 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll @@ -52,34 +52,38 @@ define i32 @chained_partial_reduce_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = 
call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) -; CHECK-SVE-NEXT: [[TMP6:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP6]] -; CHECK-SVE-NEXT: [[TMP12:%.*]] = sub <16 x i32> zeroinitializer, [[TMP11]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw [[TMP13]], [[TMP10]] +; CHECK-SVE-NEXT: [[TMP12:%.*]] = sub zeroinitializer, [[TMP11]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP12]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) +; CHECK-SVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -201,33 +205,37 @@ define i32 @chained_partial_reduce_add_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = 
getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) -; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP10:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP10]]) -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP12:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = mul nsw [[TMP12]], [[TMP14]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP15]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw [[TMP12]], [[TMP10]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP11]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) +; CHECK-SVE-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -349,34 +357,38 @@ define i32 @chained_partial_reduce_sub_add(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-SVE-NEXT: 
[[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] -; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) -; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP11]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP12]]) -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub nsw zeroinitializer, [[TMP16]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP10]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw [[TMP13]], [[TMP11]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP12]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) +; CHECK-SVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -502,35 +514,39 @@ define i32 @chained_partial_reduce_sub_sub(ptr %a, ptr %b, ptr %c, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; 
CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] -; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) -; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP10]] -; CHECK-SVE-NEXT: [[TMP13:%.*]] = sub <16 x i32> zeroinitializer, [[TMP12]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP17:%.*]] = mul nsw [[TMP14]], [[TMP16]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub nsw zeroinitializer, [[TMP17]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP10]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw [[TMP14]], [[TMP11]] +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sub zeroinitializer, [[TMP12]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call 
@llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP13]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) +; CHECK-SVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -658,35 +674,39 @@ define i32 @chained_partial_reduce_add_add_add(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) -; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP5]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP11]]) -; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP5]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> 
@llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP12]]) -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP16:%.*]] = mul nsw [[TMP13]], [[TMP15]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP16]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP11:%.*]] = mul nsw [[TMP13]], [[TMP10]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP11]]) +; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw [[TMP15]], [[TMP10]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE3]], [[TMP12]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]]) +; CHECK-SVE-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE4]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -818,37 +838,41 @@ define i32 @chained_partial_reduce_sub_add_sub(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE4:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: 
[[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] -; CHECK-SVE-NEXT: [[TMP6:%.*]] = sub nsw <16 x i32> zeroinitializer, [[TMP5]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) -; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP11]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP13]]) -; CHECK-SVE-NEXT: [[TMP14:%.*]] = mul nsw <16 x i32> [[TMP4]], [[TMP11]] -; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub <16 x i32> zeroinitializer, [[TMP14]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE3]], <16 x i32> [[TMP10]]) -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP17:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP18:%.*]] = mul nsw [[TMP15]], [[TMP17]] +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sub nsw zeroinitializer, [[TMP18]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP10]]) +; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[TMP12:%.*]] = mul nsw [[TMP15]], [[TMP11]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP12]]) +; CHECK-SVE-NEXT: [[TMP13:%.*]] = mul nsw [[TMP17]], [[TMP11]] +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sub zeroinitializer, [[TMP13]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE4]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE3]], [[TMP14]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP22]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE4]]) +; CHECK-SVE-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE4]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -978,32 +1002,36 @@ define i32 @chained_partial_reduce_madd_extadd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; 
CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]]) -; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP5]]) -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = mul nsw [[TMP11]], [[TMP13]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP14]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP10]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) +; CHECK-SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; 
CHECK-SVE: scalar.ph: @@ -1118,28 +1146,32 @@ define i32 @chained_partial_reduce_extadd_extadd(ptr %a, ptr %b, i32 %N) #0 { ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[TMP2:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]]) -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP3]]) -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[TMP6:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP6]]) +; CHECK-SVE-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE2]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP10]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE2]]) +; CHECK-SVE-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE2]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: @@ -1251,32 +1283,36 @@ define 
i32 @chained_partial_reduce_extadd_madd(ptr %a, ptr %b, ptr %c, i32 %N) # ; CHECK-SVE-NEXT: [[CMP28_NOT:%.*]] = icmp ult i32 [[N]], 2 ; CHECK-SVE-NEXT: [[DIV27:%.*]] = lshr i32 [[N]], 1 ; CHECK-SVE-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[DIV27]] to i64 -; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[WIDE_TRIP_COUNT]], [[TMP1]] ; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-SVE: vector.ph: -; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], 16 +; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-SVE-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16 +; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[WIDE_TRIP_COUNT]], [[TMP3]] ; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], [[N_MOD_VF]] ; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-SVE: vector.body: ; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-SVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[INDEX]] ; CHECK-SVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw i8, ptr [[C]], i64 [[INDEX]] -; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-SVE-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) -; CHECK-SVE-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-SVE-NEXT: [[TMP6:%.*]] = mul nsw <16 x i32> [[TMP3]], [[TMP4]] -; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[PARTIAL_REDUCE]], <16 x i32> [[TMP6]]) -; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-SVE-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-SVE-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE:%.*]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP11]]) +; CHECK-SVE-NEXT: [[TMP13:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-SVE-NEXT: [[TMP14:%.*]] = sext [[WIDE_LOAD1]] to +; CHECK-SVE-NEXT: [[TMP10:%.*]] = mul nsw [[TMP13]], [[TMP14]] +; CHECK-SVE-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[PARTIAL_REDUCE]], [[TMP10]]) +; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-SVE-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop 
[[LOOP20:![0-9]+]] ; CHECK-SVE: middle.block: -; CHECK-SVE-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE3]]) +; CHECK-SVE-NEXT: [[TMP12:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE3]]) ; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK-SVE: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll index 5b9bd0997f2fa..fed979e34e5d5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product-epilogue.ll @@ -10,26 +10,30 @@ define i32 @dotp(ptr %a, ptr %b) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP3]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP8]], [[TMP5]] -; CHECK-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP9]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-NEXT: [[TMP5:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP9:%.*]] = mul [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP9]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK: for.exit: -; CHECK-NEXT: ret i32 [[TMP11]] +; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label 
[[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK: scalar.ph:
 ;
 entry:
   br label %for.body
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
index 71eb5476b7ac5..5380307215d3f 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll
@@ -12,62 +12,74 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVE1-NEXT: entry:
 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVE1: vector.body:
 ; CHECK-INTERLEAVE1-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP16]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP16]], align 1
 ; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul <vscale x 16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP6]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
-; CHECK-INTERLEAVE1: for.exit:
-; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVE1: scalar.ph:
 ;
 ; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
 ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
 ; CHECK-INTERLEAVED-NEXT: entry:
 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
 ; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK-INTERLEAVED: vector.body:
 ; CHECK-INTERLEAVED-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT1:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP20]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP20]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP20]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP5]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX1]]
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP28]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP4]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP6]])
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP8]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP9]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], 32
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT1]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP28]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP28]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i32> [[TMP10]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP12]])
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul <vscale x 16 x i32> [[TMP13]], [[TMP14]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP15]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT1]] = add nuw i64 [[INDEX1]], [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT1]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
-; CHECK-INTERLEAVED: for.exit:
-; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVED: scalar.ph:
 ;
 ; CHECK-MAXBW-LABEL: define i32 @dotp(
 ; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
@@ -162,16 +174,16 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b
 ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 48
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 32
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 48
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 32
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 48
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1
@@ -299,16 +311,16 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %
 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]]
 ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2
 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]]
-; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 8
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 24
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 24
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 8
-; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 24
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 24
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[NEXT_GEP3]], align 2
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2
@@ -525,7 +537,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30
 ; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31
 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]]
-; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i64 16
+; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1
 ; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
@@ -789,13 +801,13 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 {
 ; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP10]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP17]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP17]], i32 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> @@ -991,18 +1003,22 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-LABEL: define i32 @dotp_unrolled( ; CHECK-INTERLEAVE1-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP0]], 4 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], [[TMP12]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP14]] ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE13:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE10:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1 @@ -1014,38 +1030,38 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = sext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = mul nsw <16 x i32> [[TMP23]], [[TMP12]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP13]]) -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = mul nsw <16 x i32> [[TMP14]], [[TMP15]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP16]]) -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = mul nsw <16 x i32> [[TMP17]], [[TMP18]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP19]]) -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = sext <16 x i8> [[WIDE_LOAD11]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = mul nsw <16 x i32> [[TMP20]], [[TMP21]] -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP22]]) -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP2]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = sext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw [[TMP15]], [[TMP16]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI3]], [[TMP17]]) +; 
CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP5]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = sext [[WIDE_LOAD5]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = sext [[WIDE_LOAD6]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = mul nsw [[TMP18]], [[TMP19]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE7]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI2]], [[TMP20]]) +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = sext [[WIDE_LOAD8]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = sext [[WIDE_LOAD9]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP23:%.*]] = mul nsw [[TMP21]], [[TMP22]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE10]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP23]]) +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD11:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = sext [[WIDE_LOAD11]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD12]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = mul nsw [[TMP24]], [[TMP25]] +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE13]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP26]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] ; CHECK-INTERLEAVE1-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE13]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP25:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE10]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP26:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE7]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP27:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP28:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE13]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP29:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE10]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP30:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE7]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP31:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -1053,22 +1069,26 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-LABEL: define i32 @dotp_unrolled( ; CHECK-INTERLEAVED-SAME: i32 [[NUM_OUT:%.*]], i64 [[NUM_IN:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: -; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM_IN]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP0]], 5 +; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult 
i64 [[NUM_IN]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul nuw i64 [[TMP13]], 32 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[NUM_IN]], [[TMP14]] ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[NUM_IN]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE28:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE29:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE22:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE23:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE28:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE29:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE22:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE23:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI4:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI5:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE17:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI6:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI7:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE11:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = or disjoint i64 [[INDEX]], 1 @@ -1080,74 +1100,90 @@ define i32 @dotp_unrolled(i32 %num_out, i64 %num_in, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = or disjoint i64 [[INDEX]], 3 ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[TMP9]] -; CHECK-INTERLEAVED-NEXT: 
[[TMP43:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i8>, ptr [[TMP43]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <16 x i8> [[WIDE_LOAD9]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul nsw <16 x i32> [[TMP13]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI6]], <16 x i32> [[TMP15]]) -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext <16 x i8> [[WIDE_LOAD10]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sext <16 x i8> [[WIDE_LOAD8]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nsw <16 x i32> [[TMP16]], [[TMP17]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI7]], <16 x i32> [[TMP18]]) -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load <16 x i8>, ptr [[TMP19]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load <16 x i8>, ptr [[TMP20]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext <16 x i8> [[WIDE_LOAD12]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sext <16 x i8> [[WIDE_LOAD14]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nsw <16 x i32> [[TMP21]], [[TMP22]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI4]], <16 x i32> [[TMP23]]) -; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext <16 x i8> [[WIDE_LOAD13]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext <16 x i8> [[WIDE_LOAD15]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw <16 x i32> [[TMP24]], [[TMP25]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI5]], <16 x i32> [[TMP26]]) -; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load <16 x i8>, ptr [[TMP27]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load <16 x i8>, ptr [[TMP28]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = sext <16 x i8> [[WIDE_LOAD18]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = sext <16 x i8> [[WIDE_LOAD20]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = mul nsw <16 x i32> [[TMP29]], [[TMP30]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call <4 x i32> 
@llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP31]]) -; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext <16 x i8> [[WIDE_LOAD19]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext <16 x i8> [[WIDE_LOAD21]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = mul nsw <16 x i32> [[TMP48]], [[TMP33]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE23]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI3]], <16 x i32> [[TMP34]]) -; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load <16 x i8>, ptr [[TMP35]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load <16 x i8>, ptr [[TMP36]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext <16 x i8> [[WIDE_LOAD24]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = sext <16 x i8> [[WIDE_LOAD26]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = mul nsw <16 x i32> [[TMP37]], [[TMP38]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP39]]) -; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = sext <16 x i8> [[WIDE_LOAD25]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = sext <16 x i8> [[WIDE_LOAD27]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = mul nsw <16 x i32> [[TMP40]], [[TMP41]] -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP42]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP17]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = shl nuw i64 [[TMP18]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 [[TMP19]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP2]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP20]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = sext [[WIDE_LOAD9]] to +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = mul nsw [[TMP21]], [[TMP22]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI6]], [[TMP23]]) +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = sext [[WIDE_LOAD10]] to +; CHECK-INTERLEAVED-NEXT: [[TMP25:%.*]] = sext [[WIDE_LOAD8]] to +; CHECK-INTERLEAVED-NEXT: [[TMP26:%.*]] = mul nsw [[TMP24]], [[TMP25]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE11]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI7]], [[TMP26]]) +; CHECK-INTERLEAVED-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP28:%.*]] = shl nuw i64 [[TMP27]], 4 +; 
CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP28]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD12:%.*]] = load , ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD13:%.*]] = load , ptr [[TMP29]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = shl nuw i64 [[TMP30]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP63:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 [[TMP31]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD14:%.*]] = load , ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD15:%.*]] = load , ptr [[TMP63]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP33:%.*]] = sext [[WIDE_LOAD12]] to +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = sext [[WIDE_LOAD14]] to +; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = mul nsw [[TMP33]], [[TMP34]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE16]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI4]], [[TMP35]]) +; CHECK-INTERLEAVED-NEXT: [[TMP36:%.*]] = sext [[WIDE_LOAD13]] to +; CHECK-INTERLEAVED-NEXT: [[TMP37:%.*]] = sext [[WIDE_LOAD15]] to +; CHECK-INTERLEAVED-NEXT: [[TMP38:%.*]] = mul nsw [[TMP36]], [[TMP37]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE17]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI5]], [[TMP38]]) +; CHECK-INTERLEAVED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP40:%.*]] = shl nuw i64 [[TMP39]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP41:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 [[TMP40]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD18:%.*]] = load , ptr [[TMP7]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD19:%.*]] = load , ptr [[TMP41]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP43:%.*]] = shl nuw i64 [[TMP42]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 [[TMP43]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD20:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD21:%.*]] = load , ptr [[TMP44]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = sext [[WIDE_LOAD18]] to +; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = sext [[WIDE_LOAD20]] to +; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = mul nsw [[TMP45]], [[TMP46]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE22]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI2]], [[TMP47]]) +; CHECK-INTERLEAVED-NEXT: [[TMP48:%.*]] = sext [[WIDE_LOAD19]] to +; CHECK-INTERLEAVED-NEXT: [[TMP49:%.*]] = sext [[WIDE_LOAD21]] to +; CHECK-INTERLEAVED-NEXT: [[TMP50:%.*]] = mul nsw [[TMP48]], [[TMP49]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE23]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI3]], [[TMP50]]) +; CHECK-INTERLEAVED-NEXT: [[TMP51:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP52:%.*]] = shl nuw i64 [[TMP51]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP53:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP52]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD24:%.*]] = load , ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD25:%.*]] = load , ptr [[TMP53]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP54:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP55:%.*]] = shl nuw i64 [[TMP54]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP56:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 [[TMP55]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD26:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD27:%.*]] = load , ptr 
[[TMP56]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP57:%.*]] = sext [[WIDE_LOAD24]] to +; CHECK-INTERLEAVED-NEXT: [[TMP58:%.*]] = sext [[WIDE_LOAD26]] to +; CHECK-INTERLEAVED-NEXT: [[TMP59:%.*]] = mul nsw [[TMP57]], [[TMP58]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE28]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP59]]) +; CHECK-INTERLEAVED-NEXT: [[TMP60:%.*]] = sext [[WIDE_LOAD25]] to +; CHECK-INTERLEAVED-NEXT: [[TMP61:%.*]] = sext [[WIDE_LOAD27]] to +; CHECK-INTERLEAVED-NEXT: [[TMP62:%.*]] = mul nsw [[TMP60]], [[TMP61]] +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE29]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP62]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP14]] ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP32]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]] -; CHECK-INTERLEAVED-NEXT: [[TMP44:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add <4 x i32> [[PARTIAL_REDUCE23]], [[PARTIAL_REDUCE22]] -; CHECK-INTERLEAVED-NEXT: [[TMP45:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX30]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add <4 x i32> [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]] -; CHECK-INTERLEAVED-NEXT: [[TMP46:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX31]]) -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add <4 x i32> [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE]] -; CHECK-INTERLEAVED-NEXT: [[TMP47:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX32]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[PARTIAL_REDUCE29]], [[PARTIAL_REDUCE28]] +; CHECK-INTERLEAVED-NEXT: [[TMP64:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX30:%.*]] = add [[PARTIAL_REDUCE23]], [[PARTIAL_REDUCE22]] +; CHECK-INTERLEAVED-NEXT: [[TMP65:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX30]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX31:%.*]] = add [[PARTIAL_REDUCE17]], [[PARTIAL_REDUCE16]] +; CHECK-INTERLEAVED-NEXT: [[TMP66:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX31]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX32:%.*]] = add [[PARTIAL_REDUCE11]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP67:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX32]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[NUM_IN]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -1280,32 +1316,32 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16 ; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2 +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4 ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = sub i64 [[N]], 
[[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP11]], [[ACTIVE_LANE_MASK]], poison) -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], [[ACTIVE_LANE_MASK]], poison) +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to ; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP14]], [[ACTIVE_LANE_MASK]], poison) -; CHECK-INTERLEAVE1-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to -; CHECK-INTERLEAVE1-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] -; CHECK-INTERLEAVE1-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], [[ACTIVE_LANE_MASK]], poison) +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext [[WIDE_MASKED_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul nsw [[TMP10]], [[TMP13]] +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP12]]) ; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] -; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVE1-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: ; CHECK-INTERLEAVE1-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; 
CHECK-INTERLEAVE1-NEXT: br label [[EXIT:%.*]] @@ -1318,32 +1354,32 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16 ; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 2 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4 ; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sub i64 [[N]], [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp ugt i64 [[N]], [[TMP6]] ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP7]], i64 0 -; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[N]]) +; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[N]]) ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP11]], [[ACTIVE_LANE_MASK]], poison) -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], [[ACTIVE_LANE_MASK]], poison) +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP14]], [[ACTIVE_LANE_MASK]], poison) -; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to -; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = add [[TMP17]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP19]] = select [[ACTIVE_LANE_MASK]], [[TMP18]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], [[ACTIVE_LANE_MASK]], poison) +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext [[WIDE_MASKED_LOAD1]] to +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul nsw [[TMP10]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP19]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP12]]) ; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP9]]) -; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], 
i32 0 +; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = xor i1 [[TMP20]], true -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: ; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP19]]) ; CHECK-INTERLEAVED-NEXT: br label [[EXIT:%.*]] @@ -1370,10 +1406,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], [[ACTIVE_LANE_MASK]], poison) +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], [[ACTIVE_LANE_MASK]], poison) ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext [[WIDE_MASKED_LOAD1]] to -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext [[WIDE_MASKED_LOAD]] to ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw [[TMP16]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP17]], zeroinitializer ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP18]]) @@ -1416,66 +1452,82 @@ define i32 @not_dotp_extend_user(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 8 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = zext [[WIDE_LOAD]] to ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP9]], [[TMP1]] -; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[TMP4]], 
[[VEC_PHI]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP6]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext [[WIDE_LOAD1]] to +; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = mul [[TMP5]], [[TMP4]] +; CHECK-INTERLEAVE1-NEXT: [[TMP7]] = add [[TMP14]], [[VEC_PHI]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) -; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = extractelement <16 x i32> [[TMP9]], i32 15 -; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVE1: for.exit: -; CHECK-INTERLEAVE1-NEXT: [[RESULT:%.*]] = add i32 [[TMP7]], [[TMP8]] -; CHECK-INTERLEAVE1-NEXT: ret i32 [[RESULT]] +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[TMP7]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = mul nuw i32 [[TMP10]], 8 +; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = sub i32 [[TMP11]], 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = extractelement [[TMP5]], i32 [[TMP12]] +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] +; CHECK-INTERLEAVE1: scalar.ph: ; ; CHECK-INTERLEAVED-LABEL: define i32 @not_dotp_extend_user( ; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> 
[[WIDE_LOAD2]] to <16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP24]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP3]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext [[WIDE_LOAD2]] to ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP8]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul <16 x i32> [[TMP6]], [[TMP2]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = mul <16 x i32> [[TMP7]], [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[TMP10]] = add <16 x i32> [[TMP15]], [[VEC_PHI]] -; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add <16 x i32> [[TMP9]], [[VEC_PHI1]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD3]] to +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = mul [[TMP12]], [[TMP6]] +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul [[TMP13]], [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP16]] = add [[TMP14]], [[VEC_PHI]] +; CHECK-INTERLEAVED-NEXT: [[TMP17]] = add [[TMP15]], [[VEC_PHI1]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP11]], [[TMP10]] -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = extractelement <16 x i32> [[TMP7]], i32 15 -; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]] -; CHECK-INTERLEAVED: for.exit: -; CHECK-INTERLEAVED-NEXT: [[RESULT:%.*]] = add i32 [[TMP13]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: ret i32 [[RESULT]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP17]], [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = mul nuw i32 [[TMP20]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = sub 
i32 [[TMP21]], 1
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = extractelement <vscale x 16 x i32> [[TMP13]], i32 [[TMP22]]
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @not_dotp_extend_user(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
@@ -1538,62 +1590,83 @@ define i64 @dotp_cost_disagreement(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-LABEL: define i64 @dotp_cost_disagreement(
; CHECK-INTERLEAVE1-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVE1-NEXT: entry:
-; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP10]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = mul nuw nsw <16 x i64> [[TMP3]], [[TMP4]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP5]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 32
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP12]], [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP9]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i64 @dotp_cost_disagreement(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
; CHECK-INTERLEAVED-NEXT: entry:
-; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 5
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 41, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 41, [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 41, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP5]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP7]], i64 [[TMP6]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP8]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = add nuw nsw i64 [[INDEX]], 1
; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw i8, ptr [[B]], i64 [[TMP14]]
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP15]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP13]])
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP8]], [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP10]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-INTERLEAVED-NEXT: br i1 true, label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP15]], i64 [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP15]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP12]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP13]], [[TMP21]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP22]])
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP16]], [[TMP17]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI1]], <vscale x 16 x i64> [[TMP18]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 41, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i64 @dotp_cost_disagreement(
@@ -1880,7 +1953,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 8
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
@@ -2009,7 +2082,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 8
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
@@ -2093,32 +2166,36 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK-INTERLEAVE1: for.body.preheader:
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
-; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP12]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVE1-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = mul nuw nsw <16 x i64> [[TMP5]], [[TMP6]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP10]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP14]], [[TMP10]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP11]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-INTERLEAVE1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]])
; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVE1: scalar.ph:
@@ -2130,42 +2207,50 @@ define dso_local i32 @not_dotp_vscale1(ptr %a, ptr %b, i32 %n, i64 %cost) #0 {
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT:%.*]]
; CHECK-INTERLEAVED: for.body.preheader:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
-; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 5
+; CHECK-INTERLEAVED-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], [[TMP2]]
; CHECK-INTERLEAVED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
-; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw i64 [[TMP3]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = trunc i64 [[N_VEC]] to i32
; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> zeroinitializer, i64 [[COST]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[COST]], i32 0
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ [[TMP4]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ [[TMP21]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE6:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul nuw nsw <16 x i64> [[TMP13]], [[TMP15]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP16]])
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nuw nsw <16 x i64> [[TMP10]], [[TMP11]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP12]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
+; CHECK-INTERLEAVED-NEXT: [[TMP23:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP23]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP10]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP11]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP12]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP2]], i64 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[NEXT_GEP2]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP15]], [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD5]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = mul nuw nsw <vscale x 16 x i64> [[TMP18]], [[TMP19]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE6]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI1]], <vscale x 16 x i64> [[TMP20]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]]
; CHECK-INTERLEAVED-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[PARTIAL_REDUCE6]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP22:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
; CHECK-INTERLEAVED: scalar.ph:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
index 1f5d5f0ea218e..66b8026d46704 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-sub.ll
@@ -12,65 +12,77 @@ define i32 @dotp(ptr %a, ptr %b) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul <16 x i32> [[TMP2]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = sub <16 x i32> zeroinitializer, [[TMP4]]
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD1:%.*]] = load <vscale x 16 x i8>, ptr [[TMP10]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD1]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = mul <vscale x 16 x i32> [[TMP4]], [[TMP5]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP6]]
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP11]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: br label [[FOR_EXIT:%.*]]
-; CHECK-INTERLEAVE1: for.exit:
-; CHECK-INTERLEAVE1-NEXT: ret i32 [[TMP8]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @dotp(
; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP7]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP7]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP5]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP14]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP14]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD3]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = mul <16 x i32> [[TMP4]], [[TMP5]]
-; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = sub <16 x i32> zeroinitializer, [[TMP6]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP15]])
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <16 x i8> [[WIDE_LOAD4]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul <16 x i32> [[TMP8]], [[TMP9]]
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = sub <16 x i32> zeroinitializer, [[TMP10]]
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP11]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP20]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP14]], i64 [[TMP8]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <vscale x 16 x i8>, ptr [[TMP14]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <vscale x 16 x i8>, ptr [[TMP9]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD3]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul <vscale x 16 x i32> [[TMP10]], [[TMP11]]
+; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP12]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP13]])
+; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD4]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = mul <vscale x 16 x i32> [[TMP21]], [[TMP15]]
+; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = sub <vscale x 16 x i32> zeroinitializer, [[TMP16]]
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE5]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP17]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br label [[FOR_EXIT:%.*]]
-; CHECK-INTERLEAVED: for.exit:
-; CHECK-INTERLEAVED-NEXT: ret i32 [[TMP13]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @dotp(
; CHECK-MAXBW-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
index dd2fe09a9e593..6e6137957f2f5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll
@@ -14,20 +14,25 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP2]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP2]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP0]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP3]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @zext_add_reduc_i8_i32_sve(
@@ -35,26 +40,33 @@ define i32 @zext_add_reduc_i8_i32_sve(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP2]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]])
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP4]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP0]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP0]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI1]], <vscale x 16 x i32> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i32 @zext_add_reduc_i8_i32_sve(
@@ -134,7 +146,7 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
@@ -196,20 +208,25 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP3]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP3]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i64 @zext_add_reduc_i8_i64(
@@ -217,26 +234,33 @@ define i64 @zext_add_reduc_i8_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI]], <16 x i64> [[TMP4]])
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i64>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v16i64(<2 x i64> [[VEC_PHI1]], <16 x i64> [[TMP5]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 4
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 16 x i8>, ptr [[TMP5]], align 1
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 16 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD2]] to <vscale x 16 x i64>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv16i64(<vscale x 2 x i64> [[VEC_PHI1]], <vscale x 16 x i64> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i64 @zext_add_reduc_i8_i64(
@@ -290,20 +314,25 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP3]])
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP1]], align 2
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x i64>
+; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[TMP3]])
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[PARTIAL_REDUCE]])
-; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[PARTIAL_REDUCE]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i64 @zext_add_reduc_i16_i64(
@@ -311,26 +340,33 @@ define i64 @zext_add_reduc_i16_i64(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[TMP1]], i64 8
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2
-; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2
-; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI]], <8 x i64> [[TMP4]])
-; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <8 x i16> [[WIDE_LOAD2]] to <8 x i64>
-; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <2 x i64> @llvm.vector.partial.reduce.add.v2i64.v8i64(<2 x i64> [[VEC_PHI1]], <8 x i64> [[TMP5]])
-; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3
+; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[TMP1]], i64 [[TMP4]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i16>, ptr [[TMP1]], align 2
+; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x i16>, ptr [[TMP5]], align 2
+; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <vscale x 8 x i16> [[WIDE_LOAD]] to <vscale x 8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI]], <vscale x 8 x i64> [[TMP6]])
+; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i16> [[WIDE_LOAD2]] to <vscale x 8 x i64>
+; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <vscale x 2 x i64> @llvm.vector.partial.reduce.add.nxv2i64.nxv8i64(<vscale x 2 x i64> [[VEC_PHI1]], <vscale x 8 x i64> [[TMP7]])
+; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
-; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
-; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> [[BIN_RDX]])
-; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <vscale x 2 x i64> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]]
+; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.nxv2i64(<vscale x 2 x i64> [[BIN_RDX]])
+; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
+; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVED: scalar.ph:
;
; CHECK-MAXBW-LABEL: define i64 @zext_add_reduc_i16_i64(
@@ -413,9 +449,9 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 16
-; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP1]], i64 48
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 32
+; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1
; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1
@@ -486,21 +522,21 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 {
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1025)
+; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 1025)
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP6]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[VEC_PHI]]
-; CHECK-INTERLEAVE1-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP6]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP3]], <vscale x 16 x i32> zeroinitializer
+; CHECK-INTERLEAVE1-NEXT: [[TMP10]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP4]])
; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
-; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025)
-; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-INTERLEAVE1-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 1025)
+; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = xor i1 [[TMP11]], true
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
@@ -515,21 +551,21 @@ define i32 @zext_add_reduc_i8_i32_predicated(ptr %a) #0 {
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 4
-; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 1025)
+; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 1025)
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr align 1 [[TMP6]], <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
-; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
-; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = add <vscale x 4 x i32> [[TMP8]], [[VEC_PHI]]
-; CHECK-INTERLEAVED-NEXT: [[TMP10]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i32> [[TMP9]], <vscale x 4 x i32> [[VEC_PHI]]
+; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP6]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
+; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP3]], <vscale x 16 x i32> zeroinitializer
+; CHECK-INTERLEAVED-NEXT: [[TMP10]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP4]])
; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]]
-; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX_NEXT]], i64 1025)
-; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
+; CHECK-INTERLEAVED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 1025)
+; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = xor i1 [[TMP11]], true
; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
; CHECK-INTERLEAVED: middle.block:
@@ -674,20 +710,25 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVE1-NEXT: entry:
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVE1: vector.ph:
+; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], 8
+; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVE1: vector.body:
; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
-; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1
-; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = sub <16 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[TMP1]], align 1
+; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
+; CHECK-INTERLEAVE1-NEXT: [[TMP4]] = sub <vscale x 8 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
; CHECK-INTERLEAVE1: middle.block:
-; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
-; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP4]])
+; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]]
+; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]]
; CHECK-INTERLEAVE1: scalar.ph:
;
; CHECK-INTERLEAVED-LABEL: define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(
@@ -695,38 +736,49 @@ define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 {
; CHECK-INTERLEAVED-NEXT: entry:
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]]
; CHECK-INTERLEAVED: vector.ph:
+; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], 32
+; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP2]]
+; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]]
; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK-INTERLEAVED: vector.body:
; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ]
+; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ]
CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32 -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP1]], i64 48 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext <16 x i8> [[WIDE_LOAD5]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext <16 x i8> [[WIDE_LOAD6]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[TMP6]] = sub <16 x i32> [[VEC_PHI]], [[TMP4]] -; CHECK-INTERLEAVED-NEXT: [[TMP7]] = sub <16 x i32> [[VEC_PHI1]], [[TMP5]] -; CHECK-INTERLEAVED-NEXT: [[TMP10]] = sub <16 x i32> [[VEC_PHI2]], [[TMP12]] -; CHECK-INTERLEAVED-NEXT: [[TMP11]] = sub <16 x i32> [[VEC_PHI3]], [[TMP14]] -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 3 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = shl nuw i64 [[TMP6]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = mul nuw i64 [[TMP9]], 24 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP8]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP11]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = zext [[WIDE_LOAD4]] to +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = zext [[WIDE_LOAD5]] to +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext [[WIDE_LOAD6]] to +; CHECK-INTERLEAVED-NEXT: [[TMP16]] = sub [[VEC_PHI]], [[TMP12]] +; CHECK-INTERLEAVED-NEXT: [[TMP17]] = sub [[VEC_PHI1]], [[TMP13]] +; CHECK-INTERLEAVED-NEXT: [[TMP18]] = sub [[VEC_PHI2]], [[TMP14]] +; CHECK-INTERLEAVED-NEXT: [[TMP19]] = sub [[VEC_PHI3]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP20]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add <16 x i32> [[TMP10]], [[BIN_RDX]] -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add <16 x i32> [[TMP11]], [[BIN_RDX7]] -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX8]]) -; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP17]], [[TMP16]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX7:%.*]] = add [[TMP18]], [[BIN_RDX]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX8:%.*]] = add [[TMP19]], [[BIN_RDX7]] +; CHECK-INTERLEAVED-NEXT: [[TMP21:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32( [[BIN_RDX8]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVED: scalar.ph: ; ; CHECK-MAXBW-LABEL: define i32 @zext_sub_reduc_i8_i32_has_neon_dotprod( @@ -780,20 +832,25 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: +; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP3]]) -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-INTERLEAVE1-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP1]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP3]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) -; CHECK-INTERLEAVE1-NEXT: br label [[SCALAR_PH:%.*]] +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( 
[[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVE1: scalar.ph: ; ; CHECK-INTERLEAVED-LABEL: define i32 @sext_add_reduc_i8_i32( @@ -801,26 +858,33 @@ define i32 @sext_add_reduc_i8_i32(ptr %a) #0 { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP0]], 32 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 16 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 -; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP4]]) -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI1]], <16 x i32> [[TMP5]]) -; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP1]], i64 [[TMP4]] +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP5]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = sext [[WIDE_LOAD2]] to +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE3]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI1]], [[TMP7]]) +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: 
[[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]] -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) -; CHECK-INTERLEAVED-NEXT: br label [[SCALAR_PH:%.*]] +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[PARTIAL_REDUCE3]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i64 1025, [[N_VEC]] +; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH:%.*]] ; CHECK-INTERLEAVED: scalar.ph: ; ; CHECK-MAXBW-LABEL: define i32 @sext_add_reduc_i8_i32( @@ -874,28 +938,32 @@ define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32 ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]] -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[TMP5]], 4 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP7]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV1]], i64 0 -; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = insertelement zeroinitializer, i32 [[A]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV1]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] -; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]] -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: store zeroinitializer, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP8]] = add [[VEC_PHI]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP4]] ; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] 
= icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[TMP8]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -905,33 +973,39 @@ define i32 @add_of_zext_outside_loop(i32 %a, ptr noalias %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[CONV1:%.*]] = zext i8 [[C]] to i32 ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = sub i32 1024, [[D]] -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP2]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP5]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP2]], [[TMP7]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul nuw i32 [[TMP13]], 32 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], [[TMP4]] ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = insertelement <16 x i32> zeroinitializer, i32 [[A]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[CONV1]], i64 0 -; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = insertelement zeroinitializer, i32 [[A]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV1]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP21:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ [[TMP10]], [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 16 -; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP3]], align 1 -; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP5]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI]], [[BROADCAST_SPLAT]] -; 
CHECK-INTERLEAVED-NEXT: [[TMP7]] = add <16 x i32> [[VEC_PHI2]], [[BROADCAST_SPLAT]] -; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add nuw i32 [[VEC_PHI1]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = shl nuw i64 [[TMP16]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 [[TMP9]] +; CHECK-INTERLEAVED-NEXT: store zeroinitializer, ptr [[TMP3]], align 1 +; CHECK-INTERLEAVED-NEXT: store zeroinitializer, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15]] = add [[VEC_PHI]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP12]] = add [[VEC_PHI2]], [[BROADCAST_SPLAT]] +; CHECK-INTERLEAVED-NEXT: [[TMP21]] = add nuw i32 [[VEC_PHI1]], [[TMP4]] ; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP21]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <16 x i32> [[TMP7]], [[TMP6]] -; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[TMP12]], [[TMP15]] +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv16i32( [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -995,29 +1069,33 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVE1-SAME: i32 [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVE1-NEXT: entry: ; CHECK-INTERLEAVE1-NEXT: [[TMP0:%.*]] = sub i32 1024, [[D]] -; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = shl nuw nsw i32 [[TMP5]], 4 +; CHECK-INTERLEAVE1-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP0]], [[TMP2]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVE1: vector.ph: -; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], 16 +; CHECK-INTERLEAVE1-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVE1-NEXT: [[TMP9:%.*]] = mul nuw i32 [[TMP3]], 16 +; CHECK-INTERLEAVE1-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP0]], [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP0]], [[N_MOD_VF]] ; CHECK-INTERLEAVE1-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] -; CHECK-INTERLEAVE1-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0 -; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 -; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = insertelement zeroinitializer, i32 [[A]], i32 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[C]], i64 0 +; CHECK-INTERLEAVE1-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-INTERLEAVE1-NEXT: br label [[FOR_BODY:%.*]] ; CHECK-INTERLEAVE1: vector.body: ; CHECK-INTERLEAVE1-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[FOR_BODY]] ] -; 
CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP2]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ] +; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[FOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] -; CHECK-INTERLEAVE1-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVE1-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> -; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) -; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 +; CHECK-INTERLEAVE1-NEXT: store zeroinitializer, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVE1-NEXT: [[TMP8:%.*]] = zext [[BROADCAST_SPLAT]] to +; CHECK-INTERLEAVE1-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP8]]) +; CHECK-INTERLEAVE1-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP9]] ; CHECK-INTERLEAVE1-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-INTERLEAVE1: middle.block: -; CHECK-INTERLEAVE1-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[PARTIAL_REDUCE]]) +; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[PARTIAL_REDUCE]]) ; CHECK-INTERLEAVE1-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; CHECK-INTERLEAVE1-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVE1: scalar.ph: @@ -1026,34 +1104,40 @@ define i32 @add_of_loop_invariant_zext(i32 %a, ptr %b, i8 %c, i32 %d) #0 { ; CHECK-INTERLEAVED-SAME: i32 [[A:%.*]], ptr [[B:%.*]], i8 [[C:%.*]], i32 [[D:%.*]]) #[[ATTR0]] { ; CHECK-INTERLEAVED-NEXT: entry: ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = sub i32 1024, [[D]] -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP2]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP5]], 5 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = icmp ult i32 [[TMP2]], [[TMP7]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP11]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK-INTERLEAVED: vector.ph: -; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = mul nuw i32 [[TMP3]], 32 +; CHECK-INTERLEAVED-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = add i32 [[D]], [[N_VEC]] -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> zeroinitializer, i32 [[A]], i32 0 -; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[C]], i64 0 -; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = insertelement zeroinitializer, i32 [[A]], i32 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i8 [[C]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], 
poison, zeroinitializer ; CHECK-INTERLEAVED-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP22:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP3]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] -; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE2:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[D]], [[VEC_PHI1]] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i32 [[OFFSET_IDX]] -; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16 -; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP4]], align 1 -; CHECK-INTERLEAVED-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP6]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i32> -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI]], <16 x i32> [[TMP5]]) -; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call <4 x i32> @llvm.vector.partial.reduce.add.v4i32.v16i32(<4 x i32> [[VEC_PHI2]], <16 x i32> [[TMP5]]) -; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], 32 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP8]], 4 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 [[TMP14]] +; CHECK-INTERLEAVED-NEXT: store zeroinitializer, ptr [[TMP4]], align 1 +; CHECK-INTERLEAVED-NEXT: store zeroinitializer, ptr [[TMP10]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = zext [[BROADCAST_SPLAT]] to +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI]], [[TMP15]]) +; CHECK-INTERLEAVED-NEXT: [[PARTIAL_REDUCE2]] = call @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32( [[VEC_PHI2]], [[TMP15]]) +; CHECK-INTERLEAVED-NEXT: [[TMP22]] = add nuw i32 [[VEC_PHI1]], [[TMP12]] ; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP22]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK-INTERLEAVED: middle.block: -; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add <4 x i32> [[PARTIAL_REDUCE2]], [[PARTIAL_REDUCE]] -; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[BIN_RDX]]) +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add [[PARTIAL_REDUCE2]], [[PARTIAL_REDUCE]] +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[BIN_RDX]]) ; CHECK-INTERLEAVED-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]] ; CHECK-INTERLEAVED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK-INTERLEAVED: scalar.ph: @@ -1156,9 +1240,9 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 { ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ 
[[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 4 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 8 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 12 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index bf4ab32fbf9e4..48297937ae79a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -9,15 +9,15 @@ define void @cost_store_i8(ptr %dst) #0 { ; DEFAULT-LABEL: define void @cost_store_i8( ; DEFAULT-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] { ; DEFAULT-NEXT: iter.check: +; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP10]], 3 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 101, [[TMP13]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; DEFAULT: vector.main.loop.iter.check: ; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 2 +; DEFAULT-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 5 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 101, [[TMP1]] -; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; DEFAULT: vector.main.loop.iter.check: -; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP10:%.*]] = shl nuw i64 [[TMP9]], 5 -; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 101, [[TMP10]] -; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; DEFAULT: vector.ph: ; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32 @@ -39,22 +39,22 @@ define void @cost_store_i8(ptr %dst) #0 { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 101, [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; DEFAULT: vec.epilog.iter.check: -; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP1]] +; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP13]] ; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3:![0-9]+]] ; DEFAULT: vec.epilog.ph: ; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; DEFAULT-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP14]], 4 -; 
DEFAULT-NEXT: [[N_MOD_VF2:%.*]] = urem i64 101, [[TMP15]] +; DEFAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8 +; DEFAULT-NEXT: [[N_MOD_VF2:%.*]] = urem i64 101, [[TMP12]] ; DEFAULT-NEXT: [[N_VEC3:%.*]] = sub i64 101, [[N_MOD_VF2]] ; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; DEFAULT: vec.epilog.vector.body: -; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX5]] -; DEFAULT-NEXT: store zeroinitializer, ptr [[TMP19]], align 1 -; DEFAULT-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX5]], [[TMP15]] -; DEFAULT-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] -; DEFAULT-NEXT: br i1 [[TMP21]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; DEFAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX1]] +; DEFAULT-NEXT: store zeroinitializer, ptr [[TMP9]], align 1 +; DEFAULT-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP12]] +; DEFAULT-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC3]] +; DEFAULT-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; DEFAULT: vec.epilog.middle.block: ; DEFAULT-NEXT: [[CMP_N6:%.*]] = icmp eq i64 101, [[N_VEC3]] ; DEFAULT-NEXT: br i1 [[CMP_N6]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -128,52 +128,63 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; DEFAULT-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; DEFAULT: vector.main.loop.iter.check: -; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP1:%.*]] = shl nuw i64 [[TMP0]], 5 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1000, [[TMP1]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; DEFAULT: vector.ph: -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT]], <16 x i16> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP1:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META6:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <16 x i64> poison, i64 [[TMP1]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT2]], <16 x i64> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT3]] to <16 x i8> -; DEFAULT-NEXT: [[TMP0:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT]] to <16 x i8> -; DEFAULT-NEXT: [[TMP3:%.*]] = and <16 x i8> [[TMP2]], [[TMP0]] +; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 32 +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 1000, [[TMP3]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[X]], i64 0 +; DEFAULT-NEXT: 
[[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP13:%.*]] = trunc [[BROADCAST_SPLAT]] to ; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] ; DEFAULT: vector.body: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; DEFAULT-NEXT: [[TMP5:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META6:![0-9]+]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP6:%.*]] = trunc [[BROADCAST_SPLAT3]] to +; DEFAULT-NEXT: [[TMP14:%.*]] = and [[TMP6]], [[TMP13]] ; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] -; DEFAULT-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP4]], i64 16 -; DEFAULT-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] -; DEFAULT-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP5]], align 1, !alias.scope [[META9]], !noalias [[META6]] -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; DEFAULT-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], 992 -; DEFAULT-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; DEFAULT-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4 +; DEFAULT-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP4]], i64 [[TMP16]] +; DEFAULT-NEXT: store [[TMP14]], ptr [[TMP4]], align 1, !alias.scope [[META9:![0-9]+]], !noalias [[META6]] +; DEFAULT-NEXT: store [[TMP14]], ptr [[TMP17]], align 1, !alias.scope [[META9]], !noalias [[META6]] +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] +; DEFAULT-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; DEFAULT: middle.block: -; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 1000, [[N_VEC]] +; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; DEFAULT: vec.epilog.iter.check: -; DEFAULT-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]] +; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 +; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] ; DEFAULT: vec.epilog.ph: -; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i16> poison, i16 [[X]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT4]], <8 x i16> poison, <8 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP8:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META6]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP8]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT7]], <8 x i64> poison, <8 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP9:%.*]] = trunc <8 x i64> [[BROADCAST_SPLAT8]] 
to <8 x i8> ; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <8 x i16> [[BROADCAST_SPLAT5]] to <8 x i8> -; DEFAULT-NEXT: [[TMP10:%.*]] = and <8 x i8> [[TMP9]], [[TMP7]] ; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; DEFAULT: vec.epilog.vector.body: ; DEFAULT-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX6]] -; DEFAULT-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP13]], align 1, !alias.scope [[META9]], !noalias [[META6]] +; DEFAULT-NEXT: [[TMP8:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META12:![0-9]+]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP8]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT7]], <8 x i64> poison, <8 x i32> zeroinitializer +; DEFAULT-NEXT: [[TMP9:%.*]] = trunc <8 x i64> [[BROADCAST_SPLAT8]] to <8 x i8> +; DEFAULT-NEXT: [[TMP10:%.*]] = and <8 x i8> [[TMP9]], [[TMP7]] +; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX6]] +; DEFAULT-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP11]], align 1, !alias.scope [[META15:![0-9]+]], !noalias [[META12]] ; DEFAULT-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 8 ; DEFAULT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT9]], 1000 -; DEFAULT-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; DEFAULT-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; DEFAULT: vec.epilog.middle.block: ; DEFAULT-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; DEFAULT: vec.epilog.scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 992, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1000, [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -185,7 +196,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: store i8 [[TRUNC]], ptr [[GEP]], align 1 ; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]] +; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] ; DEFAULT: exit: ; DEFAULT-NEXT: ret void ; @@ -202,25 +213,25 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; PRED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PRED: vector.ph: ; PRED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 2 -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1000) -; PRED-NEXT: [[TMP2:%.*]] = trunc [[BROADCAST_SPLAT]] to +; PRED-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP0]], 16 +; PRED-NEXT: 
[[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[X]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 1000) +; PRED-NEXT: [[TMP2:%.*]] = trunc [[BROADCAST_SPLAT]] to ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] ; PRED: vector.body: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP3:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META3:![0-9]+]] -; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer -; PRED-NEXT: [[TMP4:%.*]] = trunc [[BROADCAST_SPLAT3]] to -; PRED-NEXT: [[TMP5:%.*]] = and [[TMP4]], [[TMP2]] +; PRED-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; PRED-NEXT: [[TMP4:%.*]] = trunc [[BROADCAST_SPLAT3]] to +; PRED-NEXT: [[TMP5:%.*]] = and [[TMP4]], [[TMP2]] ; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] -; PRED-NEXT: call void @llvm.masked.store.nxv2i8.p0( [[TMP5]], ptr align 1 [[TMP6]], [[ACTIVE_LANE_MASK]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]] +; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0( [[TMP5]], ptr align 1 [[TMP6]], [[ACTIVE_LANE_MASK]]), !alias.scope [[META6:![0-9]+]], !noalias [[META3]] ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1000) -; PRED-NEXT: [[TMP7:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 1000) +; PRED-NEXT: [[TMP7:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i32 0 ; PRED-NEXT: [[TMP8:%.*]] = xor i1 [[TMP7]], true ; PRED-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; PRED: middle.block: @@ -268,7 +279,7 @@ attributes #1 = { vscale_range(1,16) "target-features"="+sve" } ; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; DEFAULT: [[PROF3]] = !{!"branch_weights", i32 4, i32 28} +; DEFAULT: [[PROF3]] = !{!"branch_weights", i32 8, i32 24} ; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} ; DEFAULT: [[META6]] = !{[[META7:![0-9]+]]} @@ -277,9 +288,8 @@ attributes #1 = { vscale_range(1,16) "target-features"="+sve" } ; DEFAULT: [[META9]] = !{[[META10:![0-9]+]]} ; DEFAULT: [[META10]] = distinct !{[[META10]], [[META8]]} ; DEFAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} -; DEFAULT: [[PROF12]] = !{!"branch_weights", i32 8, i32 24} -; DEFAULT: [[LOOP13]] = distinct !{[[LOOP13]], [[META1]], [[META2]]} -; DEFAULT: [[LOOP14]] = distinct !{[[LOOP14]], [[META1]]} +; 
DEFAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} +; DEFAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]]} ;. ; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index 871d9be609bd7..873b18beb85aa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -238,31 +238,37 @@ for.exit: define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 { ; CHECK-LABEL: define void @histogram_8bit( ; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 3 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP9]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP2]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP6]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 -; CHECK-NEXT: [[DOTNOT:%.*]] = sub nsw i64 0, [[TMP4]] +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[DOTNOT:%.*]] = add nsw i64 [[TMP4]], -1 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNOT]] +; CHECK-NEXT: [[N_VEC1:%.*]] = sub i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP6]] -; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i8( [[TMP7]], i8 1, splat (i1 true)) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP8]] +; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8( [[TMP17]], i8 1, splat (i1 true)) ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC1]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: +; CHECK-NEXT: [[CMP_N1:%.*]] = icmp eq i64 [[N_VEC]], 0 
+; CHECK-NEXT: br i1 [[CMP_N1]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll index b63e03dccdc18..d82dace2c9e04 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/transform-narrow-interleave-to-widen-memory-scalable.ll @@ -172,14 +172,13 @@ define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) ; ; CHECK-LABEL: define void @test_masked_interleave_group( ; CHECK-SAME: i32 [[N:%.*]], ptr [[MASK:%.*]], ptr [[SRC:%.*]], ptr [[DST:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2 -; CHECK-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP3]], i64 8) +; CHECK-NEXT: [[UMAX:%.*]] = shl nuw i64 [[TMP2]], 3 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[UMAX]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MEMCHECK:.*]] ; CHECK: [[VECTOR_MEMCHECK]]: ; CHECK-NEXT: [[TMP4:%.*]] = zext i32 [[N]] to i64 ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 4 @@ -195,45 +194,129 @@ define void @test_masked_interleave_group(i32 %N, ptr %mask, ptr %src, ptr %dst) ; CHECK-NEXT: [[BOUND14:%.*]] = icmp ult ptr [[SRC]], [[SCEVGEP]] ; CHECK-NEXT: [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]] ; CHECK-NEXT: [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT5]] -; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH]], label %[[VECTOR_PH:.*]] -; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: br i1 [[CONFLICT_RDX]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] +; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP8]], 4 +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw i64 [[TMP8]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK6:%.*]] = icmp ult i64 [[TMP1]], [[TMP20]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK6]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP9:%.*]] = mul nuw i64 [[TMP21]], 16 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP9]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP25:%.*]] = mul i64 [[INDEX1]], 16 +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP25]] +; CHECK-NEXT: [[TMP26:%.*]] = mul i64 [[INDEX1]], 16 +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP26]] +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX1]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP10]], align 1, !alias.scope [[META6:![0-9]+]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq 
[[WIDE_LOAD]], zeroinitializer +; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP27]], [[TMP27]], [[TMP27]], [[TMP27]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64f32.p0(ptr align 4 [[NEXT_GEP9]], [[INTERLEAVED_MASK]], poison), !alias.scope [[META9:![0-9]+]] +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv64f32( [[WIDE_MASKED_VEC]]) +; CHECK-NEXT: [[TMP28:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv64f32( [[TMP28]], [[TMP16]], [[TMP17]], [[TMP18]]) +; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave4.nxv64i1( [[TMP27]], [[TMP27]], [[TMP27]], [[TMP27]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv64f32.p0( [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP1]], [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX1]], [[TMP9]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] +; CHECK: [[VEC_EPILOG_ITER_CHECK]]: ; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[N_VEC]] to i32 ; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[N_VEC]], 16 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[N_VEC]], 16 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[N_VEC]] -; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] -; CHECK: [[VECTOR_BODY]]: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[UMAX]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF15:![0-9]+]] +; CHECK: [[VEC_EPILOG_PH]]: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP23:%.*]] = mul nuw i64 [[TMP22]], 8 +; CHECK-NEXT: [[N_MOD_VF10:%.*]] = urem i64 [[TMP1]], [[TMP23]] +; CHECK-NEXT: [[INDEX:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF10]] +; CHECK-NEXT: [[TMP24:%.*]] = trunc i64 [[INDEX]] to i32 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 16 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[OFFSET_IDX6:%.*]] = mul i64 [[INDEX]], 16 ; CHECK-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX6]] ; CHECK-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP8]], align 1, !alias.scope [[META6:![0-9]+]] -; CHECK-NEXT: [[TMP16:%.*]] = icmp eq [[WIDE_LOAD]], zeroinitializer -; CHECK-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave4.nxv16i1( [[TMP16]], [[TMP16]], [[TMP16]], [[TMP16]]) -; CHECK-NEXT: 
[[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv16f32.p0(ptr align 4 [[NEXT_GEP7]], [[INTERLEAVED_MASK]], poison), !alias.scope [[META9:![0-9]+]] -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv16f32( [[WIDE_MASKED_VEC]]) -; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 2 -; CHECK-NEXT: [[TMP20:%.*]] = extractvalue { , , , } [[STRIDED_VEC]], 3 -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave4.nxv16f32( [[TMP17]], [[TMP18]], [[TMP19]], [[TMP20]]) -; CHECK-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave4.nxv16i1( [[TMP16]], [[TMP16]], [[TMP16]], [[TMP16]]) -; CHECK-NEXT: call void @llvm.masked.store.nxv16f32.p0( [[INTERLEAVED_VEC]], ptr align 4 [[NEXT_GEP]], [[INTERLEAVED_MASK9]]), !alias.scope [[META11:![0-9]+]], !noalias [[META13:![0-9]+]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] -; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP21]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] -; CHECK: [[MIDDLE_BLOCK]]: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] -; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] +; CHECK: [[VEC_EPILOG_VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX12:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT23:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX12]], 16 +; CHECK-NEXT: [[NEXT_GEP13:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX1]] +; CHECK-NEXT: [[OFFSET_IDX14:%.*]] = mul i64 [[INDEX12]], 16 +; CHECK-NEXT: [[NEXT_GEP15:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[OFFSET_IDX14]] +; CHECK-NEXT: [[NEXT_GEP16:%.*]] = getelementptr i8, ptr [[MASK]], i64 [[INDEX12]] +; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load , ptr [[NEXT_GEP16]], align 1, !alias.scope [[META6]] +; CHECK-NEXT: [[TMP30:%.*]] = icmp eq [[WIDE_LOAD17]], zeroinitializer +; CHECK-NEXT: [[INTERLEAVED_MASK18:%.*]] = call @llvm.vector.interleave4.nxv32i1( [[TMP30]], [[TMP30]], [[TMP30]], [[TMP30]]) +; CHECK-NEXT: [[WIDE_MASKED_VEC19:%.*]] = call @llvm.masked.load.nxv32f32.p0(ptr align 4 [[NEXT_GEP15]], [[INTERLEAVED_MASK18]], poison), !alias.scope [[META9]] +; CHECK-NEXT: [[STRIDED_VEC20:%.*]] = call { , , , } @llvm.vector.deinterleave4.nxv32f32( [[WIDE_MASKED_VEC19]]) +; CHECK-NEXT: [[TMP31:%.*]] = extractvalue { , , , } [[STRIDED_VEC20]], 0 +; CHECK-NEXT: [[TMP32:%.*]] = extractvalue { , , , } [[STRIDED_VEC20]], 1 +; CHECK-NEXT: [[TMP33:%.*]] = extractvalue { , , , } [[STRIDED_VEC20]], 2 +; CHECK-NEXT: [[TMP34:%.*]] = extractvalue { , , , } [[STRIDED_VEC20]], 3 +; CHECK-NEXT: [[INTERLEAVED_VEC21:%.*]] = call @llvm.vector.interleave4.nxv32f32( [[TMP31]], [[TMP32]], [[TMP33]], [[TMP34]]) +; CHECK-NEXT: [[INTERLEAVED_MASK22:%.*]] = call @llvm.vector.interleave4.nxv32i1( [[TMP30]], [[TMP30]], [[TMP30]], [[TMP30]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv32f32.p0( [[INTERLEAVED_VEC21]], ptr align 4 [[NEXT_GEP13]], [[INTERLEAVED_MASK22]]), !alias.scope [[META11]], !noalias [[META13]] +; CHECK-NEXT: [[INDEX_NEXT23]] = add nuw i64 [[INDEX12]], [[TMP23]] +; CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[INDEX_NEXT23]], [[INDEX]] +; CHECK-NEXT: br i1 [[TMP35]], label 
%[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]: +; CHECK-NEXT: [[CMP_N24:%.*]] = icmp eq i64 [[TMP1]], [[INDEX]] +; CHECK-NEXT: br i1 [[CMP_N24]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] +; CHECK: [[VEC_EPILOG_SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[TMP24]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP10]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MEMCHECK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL25:%.*]] = phi ptr [ [[NEXT_GEP]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP12]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[DST]], %[[VECTOR_MEMCHECK]] ], [ [[DST]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL26:%.*]] = phi ptr [ [[NEXT_GEP7]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP14]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[SRC]], %[[VECTOR_MEMCHECK]] ], [ [[SRC]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_RESUME_VAL27:%.*]] = phi ptr [ [[NEXT_GEP8]], %[[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[TMP15]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[MASK]], %[[VECTOR_MEMCHECK]] ], [ [[MASK]], %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP_HEADER:.*]] +; CHECK: [[LOOP_HEADER]]: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], %[[LOOP_LATCH:.*]] ] +; CHECK-NEXT: [[DST_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL25]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[DST_IV_NEXT:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: [[SRC_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL26]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[SRC_IV_NEXT:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: [[MASK_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL27]], %[[VEC_EPILOG_SCALAR_PH]] ], [ [[MASK_IV_NEXT:%.*]], %[[LOOP_LATCH]] ] +; CHECK-NEXT: [[MASK_IV_NEXT]] = getelementptr i8, ptr [[MASK_IV]], i64 1 +; CHECK-NEXT: [[MASK_VAL:%.*]] = load i8, ptr [[MASK_IV]], align 1 +; CHECK-NEXT: [[SHOULD_COPY:%.*]] = icmp eq i8 [[MASK_VAL]], 0 +; CHECK-NEXT: br i1 [[SHOULD_COPY]], label %[[THEN:.*]], label %[[LOOP_LATCH]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[ELEM0:%.*]] = load float, ptr [[SRC_IV]], align 4 +; CHECK-NEXT: store float [[ELEM0]], ptr [[DST_IV]], align 4 +; CHECK-NEXT: [[SRC_1_PTR:%.*]] = getelementptr i8, ptr [[SRC_IV]], i64 4 +; CHECK-NEXT: [[S1:%.*]] = load float, ptr [[SRC_1_PTR]], align 4 +; CHECK-NEXT: [[DST_1_PTR:%.*]] = getelementptr i8, ptr [[DST_IV]], i64 4 +; CHECK-NEXT: store float [[S1]], ptr [[DST_1_PTR]], align 4 +; CHECK-NEXT: [[SRC_2_PTR:%.*]] = getelementptr i8, ptr [[SRC_IV]], i64 8 +; CHECK-NEXT: [[S2:%.*]] = load float, ptr [[SRC_2_PTR]], align 4 +; CHECK-NEXT: [[DST_2_PTR:%.*]] = getelementptr i8, ptr [[DST_IV]], i64 8 +; CHECK-NEXT: store float [[S2]], ptr [[DST_2_PTR]], align 4 +; CHECK-NEXT: [[SRC_3_PTR:%.*]] = getelementptr i8, ptr [[SRC_IV]], i64 12 +; CHECK-NEXT: [[S3:%.*]] = load float, ptr [[SRC_3_PTR]], align 4 +; CHECK-NEXT: [[DST_3_PTR:%.*]] = getelementptr i8, ptr [[DST_IV]], i64 12 +; CHECK-NEXT: store float [[S3]], ptr [[DST_3_PTR]], align 4 +; CHECK-NEXT: br label %[[LOOP_LATCH]] +; CHECK: [[LOOP_LATCH]]: +; CHECK-NEXT: [[IV_NEXT]] = add i32 [[IV]], 1 +; CHECK-NEXT: [[SRC_IV_NEXT]] = getelementptr i8, ptr [[SRC_IV]], i64 16 +; CHECK-NEXT: [[DST_IV_NEXT]] = getelementptr i8, ptr [[DST_IV]], i64 16 +; CHECK-NEXT: [[EC:%.*]] = icmp eq i32 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP_HEADER]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void ; entry: br label %loop.header diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll 
b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll index 7bc606f5c61b3..0cfc14a0ae3a8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll @@ -23,23 +23,27 @@ define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP4]], splat (i16 2) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] -; CHECK-NEXT: store <16 x i16> [[TMP5]], ptr [[TMP6]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = zext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 16 x i16> [[TMP8]], splat (i16 2) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] +; CHECK-NEXT: store <vscale x 16 x i16> [[TMP9]], ptr [[TMP10]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -99,23 +103,27 @@ define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur ; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[LEN]], -1 ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 16 +; CHECK-NEXT: [[TMP5:%.*]] = call i64
@llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 -; CHECK-NEXT: [[TMP4:%.*]] = sext <16 x i8> [[WIDE_LOAD]] to <16 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i16> [[TMP4]], splat (i16 2) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] -; CHECK-NEXT: store <16 x i16> [[TMP5]], ptr [[TMP6]], align 2 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 16 x i8>, ptr [[TMP7]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = sext <vscale x 16 x i8> [[WIDE_LOAD]] to <vscale x 16 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = add <vscale x 16 x i16> [[TMP8]], splat (i16 2) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] +; CHECK-NEXT: store <vscale x 16 x i16> [[TMP9]], ptr [[TMP10]], align 2 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll index fc0b19da47f4b..82f272ad853a8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll @@ -47,38 +47,37 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #1 { ; NARROW-NEXT: entry: ; NARROW-NEXT: br label [[VECTOR_PH:%.*]] ; NARROW: vector.ph: +; NARROW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; NARROW-NEXT: [[TMP1:%.*]] = mul nuw i64 [[TMP2]], 4 +; NARROW-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP1]] +; NARROW-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] ; NARROW-NEXT: br label [[VECTOR_BODY:%.*]] ; NARROW: vector.body: ; NARROW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; NARROW-NEXT: [[TMP0:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]] -; NARROW-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 8 -; NARROW-NEXT: [[TMP1:%.*]] = fptrunc <2 x double> [[WIDE_LOAD]] to <2 x float> -; NARROW-NEXT: [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0 -; NARROW-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP1]], i32 1 -; NARROW-NEXT: [[TMP3:%.*]] = call float @foo(float [[TMP2]]) #[[ATTR1:[0-9]+]] -; NARROW-NEXT: [[TMP5:%.*]] = call float @foo(float [[TMP4]]) #[[ATTR1]] -; NARROW-NEXT: [[TMP6:%.*]] = insertelement <2 x float> poison, float [[TMP3]], i32 0 -; NARROW-NEXT: [[TMP7:%.*]] = insertelement <2 x float> [[TMP6]], float [[TMP5]], i32 1 +; NARROW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x double>, ptr [[TMP0]], align 8 +; NARROW-NEXT:
[[TMP3:%.*]] = fptrunc <vscale x 4 x double> [[WIDE_LOAD]] to <vscale x 4 x float> +; NARROW-NEXT: [[TMP4:%.*]] = call <vscale x 4 x float> @foo_vector(<vscale x 4 x float> [[TMP3]], <vscale x 4 x i1> splat (i1 true)) ; NARROW-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; NARROW-NEXT: store <2 x float> [[TMP7]], ptr [[TMP8]], align 4 -; NARROW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; NARROW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; NARROW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NARROW-NEXT: store <vscale x 4 x float> [[TMP4]], ptr [[TMP8]], align 4 +; NARROW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; NARROW-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; NARROW-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; NARROW: middle.block: -; NARROW-NEXT: br label [[SCALAR_PH:%.*]] +; NARROW-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH:%.*]] ; NARROW: scalar.ph: ; NARROW-NEXT: br label [[FOR_BODY:%.*]] ; NARROW: for.body: -; NARROW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 1024, [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; NARROW-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[N_VEC]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; NARROW-NEXT: [[GEP:%.*]] = getelementptr double, ptr [[B]], i64 [[INDVARS_IV]] ; NARROW-NEXT: [[LOAD:%.*]] = load double, ptr [[GEP]], align 8 ; NARROW-NEXT: [[TRUNC:%.*]] = fptrunc double [[LOAD]] to float -; NARROW-NEXT: [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR1]] +; NARROW-NEXT: [[CALL:%.*]] = call float @foo(float [[TRUNC]]) #[[ATTR2:[0-9]+]] ; NARROW-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDVARS_IV]] ; NARROW-NEXT: store float [[CALL]], ptr [[ARRAYIDX]], align 4 ; NARROW-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; NARROW-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 -; NARROW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; NARROW-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; NARROW: for.cond.cleanup: ; NARROW-NEXT: ret void ; From 5ebdc01469b6a1723e3768560f3af23ebfa969e1 Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Mon, 17 Nov 2025 15:32:14 +0000 Subject: [PATCH 2/4] Add shouldMaximizeScalableVectorBandwidth to AArch64Subtarget --- llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 9 ++++++- llvm/lib/Target/AArch64/AArch64Subtarget.h | 4 +++ .../AArch64/AArch64TargetTransformInfo.cpp | 10 +++---- .../AArch64/scalable-vectorization.ll | 26 +++++++++---------- 4 files changed, 28 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 53b00e83a36b3..1f090ea802495 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -267,6 +267,13 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { PrefLoopAlignment = Align(32); MaxBytesForLoopAlignment = 16; break; + case NeoverseN2: + ShouldMaximizeScalableVectorBandwidth = false; + PrefFunctionAlignment = Align(16); + PrefLoopAlignment = Align(32); + MaxBytesForLoopAlignment = 16; + VScaleForTuning = 1; + break; case NeoverseV2: case NeoverseV3: CacheLineSize = 64; @@ -274,7 +281,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { MaxInterleaveFactor = 4; ScatterOverhead = 13; [[fallthrough]]; - case NeoverseN2: case NeoverseN3:
PrefFunctionAlignment = Align(16); PrefLoopAlignment = Align(32); @@ -282,6 +288,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { VScaleForTuning = 1; break; case NeoverseV1: + ShouldMaximizeScalableVectorBandwidth = false; PrefFunctionAlignment = Align(16); PrefLoopAlignment = Align(32); MaxBytesForLoopAlignment = 16; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 8974965c41fe3..33ef6ebd4c4fe 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -57,6 +57,7 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { #include "AArch64GenSubtargetInfo.inc" unsigned EpilogueVectorizationMinVF = 16; + bool ShouldMaximizeScalableVectorBandwidth = true; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 2; uint16_t CacheLineSize = 0; @@ -251,6 +252,9 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { unsigned getEpilogueVectorizationMinVF() const { return EpilogueVectorizationMinVF; } + bool shouldMaximizeScalableVectorBandwidth() const { + return ShouldMaximizeScalableVectorBandwidth; + } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } unsigned getVectorInsertExtractBaseCost() const; unsigned getCacheLineSize() const override { return CacheLineSize; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 4a606f242db01..3d2a57806ad13 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -379,13 +379,9 @@ bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( if (K == TargetTransformInfo::RGK_FixedWidthVector && ST->isNeonAvailable()) return true; - switch (ST->getProcFamily()) { - case AArch64Subtarget::NeoverseN2: - return false; - default: - return K == TargetTransformInfo::RGK_ScalableVector && - ST->isSVEorStreamingSVEAvailable(); - } + return K == TargetTransformInfo::RGK_ScalableVector && + ST->isSVEorStreamingSVEAvailable() && + ST->shouldMaximizeScalableVectorBandwidth(); } /// Calculate the cost of materializing a 64-bit value. 
This helper diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll index dcb2b9b08d1e9..1d215118449aa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll @@ -1,19 +1,19 @@ ; REQUIRES: asserts ; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize --disable-output -scalable-vectorization=off < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_DISABLED ; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize --disable-output -scalable-vectorization=on < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON -; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize --disable-output -vectorizer-maximize-bandwidth -scalable-vectorization=on < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON_MAXBW +; RUN: opt -mtriple=aarch64-none-linux-gnu -mattr=+sve -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize --disable-output -vectorizer-maximize-bandwidth=false -scalable-vectorization=on < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON_NOMAXBW ; Test that the MaxVF for the following loop, that has no dependence distances, ; is calculated as vscale x 4 (max legal SVE vector size) or vscale x 16 ; (maximized bandwidth for i8 in the loop). define void @test0(ptr %a, ptr %b, ptr %c) #0 { ; CHECK: LV: Checking a loop in 'test0' -; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 -; CHECK_SCALABLE_ON: LV: Selecting VF: 16 +; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 16 +; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF ; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 -; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 -; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16 +; CHECK_SCALABLE_ON_NOMAXBW: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_ON_NOMAXBW: LV: Selecting VF: vscale x 4 entry: br label %loop @@ -43,8 +43,8 @@ define void @test1(ptr %a, ptr %b) #0 { ; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF ; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 -; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4 -; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 +; CHECK_SCALABLE_ON_NOMAXBW: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_ON_NOMAXBW: LV: Selecting VF: vscale x 4 entry: br label %loop @@ -75,8 +75,8 @@ define void @test2(ptr %a, ptr %b) #0 { ; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF ; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 -; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2 -; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 +; CHECK_SCALABLE_ON_NOMAXBW: LV: Found feasible scalable VF = vscale x 2 +; CHECK_SCALABLE_ON_NOMAXBW: LV: Selecting VF: 4 entry: br label %loop @@ -107,8 +107,8 @@ define void @test3(ptr %a, ptr %b) #0 { ; CHECK_SCALABLE_ON: LV: Selecting VF: 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF ; CHECK_SCALABLE_DISABLED: 
LV: Selecting VF: 16 -; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1 -; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16 +; CHECK_SCALABLE_ON_NOMAXBW: LV: Found feasible scalable VF = vscale x 1 +; CHECK_SCALABLE_ON_NOMAXBW: LV: Selecting VF: 4 entry: br label %loop @@ -140,8 +140,8 @@ define void @test4(ptr %a, ptr %b) #0 { ; CHECK_SCALABLE_ON: LV: Selecting VF: 4 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF ; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 -; CHECK_SCALABLE_ON_MAXBW-NOT: LV: Found feasible scalable VF -; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 4 +; CHECK_SCALABLE_ON_NOMAXBW-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_ON_NOMAXBW: LV: Selecting VF: 4 entry: br label %loop From 87008e3d955122cf4c92259878da591eeff89bad Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Thu, 20 Nov 2025 11:13:03 +0000 Subject: [PATCH 3/4] Use subtarget feature --- llvm/lib/Target/AArch64/AArch64Features.td | 5 +++++ llvm/lib/Target/AArch64/AArch64Processors.td | 2 ++ llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 9 +-------- llvm/lib/Target/AArch64/AArch64Subtarget.h | 4 ---- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 2 +- 5 files changed, 9 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 72ff8613f01e7..066724bea92c9 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -894,6 +894,11 @@ def FeatureUseFixedOverScalableIfEqualCost : SubtargetFeature<"use-fixed-over-sc "UseFixedOverScalableIfEqualCost", "true", "Prefer fixed width loop vectorization over scalable if the cost-model assigns equal costs">; +def FeatureDisableMaximizeScalableBandwidth : SubtargetFeature<"disable-maximize-scalable-bandwidth", + "DisableMaximizeScalableBandwidth", "true", + "Determine the maximum scalable vector length for a loop by the " + "largest scalar type rather than the smallest">; + // For performance reasons we prefer to use ldapr to ldapur on certain cores.
def FeatureAvoidLDAPUR : SubtargetFeature<"avoid-ldapur", "AvoidLDAPUR", "true", "Prefer add+ldapr to offset ldapur">; diff --git a/llvm/lib/Target/AArch64/AArch64Processors.td b/llvm/lib/Target/AArch64/AArch64Processors.td index 11387bb97d29c..120415f91c9ae 100644 --- a/llvm/lib/Target/AArch64/AArch64Processors.td +++ b/llvm/lib/Target/AArch64/AArch64Processors.td @@ -593,6 +593,7 @@ def TuneNeoverseN2 : SubtargetFeature<"neoversen2", "ARMProcFamily", "NeoverseN2 FeatureALULSLFast, FeaturePostRAScheduler, FeatureEnableSelectOptimize, + FeatureDisableMaximizeScalableBandwidth, FeaturePredictableSelectIsExpensive]>; def TuneNeoverseN3 : SubtargetFeature<"neoversen3", "ARMProcFamily", "NeoverseN3", @@ -626,6 +627,7 @@ def TuneNeoverseV1 : SubtargetFeature<"neoversev1", "ARMProcFamily", "NeoverseV1 FeaturePostRAScheduler, FeatureEnableSelectOptimize, FeaturePredictableSelectIsExpensive, + FeatureDisableMaximizeScalableBandwidth, FeatureNoSVEFPLD1R]>; def TuneNeoverseV2 : SubtargetFeature<"neoversev2", "ARMProcFamily", "NeoverseV2", diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 1f090ea802495..53b00e83a36b3 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -267,13 +267,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { PrefLoopAlignment = Align(32); MaxBytesForLoopAlignment = 16; break; - case NeoverseN2: - ShouldMaximizeScalableVectorBandwidth = false; - PrefFunctionAlignment = Align(16); - PrefLoopAlignment = Align(32); - MaxBytesForLoopAlignment = 16; - VScaleForTuning = 1; - break; case NeoverseV2: case NeoverseV3: CacheLineSize = 64; @@ -281,6 +274,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { MaxInterleaveFactor = 4; ScatterOverhead = 13; [[fallthrough]]; + case NeoverseN2: case NeoverseN3: PrefFunctionAlignment = Align(16); PrefLoopAlignment = Align(32); @@ -288,7 +282,6 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { VScaleForTuning = 1; break; case NeoverseV1: - ShouldMaximizeScalableVectorBandwidth = false; PrefFunctionAlignment = Align(16); PrefLoopAlignment = Align(32); MaxBytesForLoopAlignment = 16; diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index 33ef6ebd4c4fe..8974965c41fe3 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -57,7 +57,6 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { #include "AArch64GenSubtargetInfo.inc" unsigned EpilogueVectorizationMinVF = 16; - bool ShouldMaximizeScalableVectorBandwidth = true; uint8_t MaxInterleaveFactor = 2; uint8_t VectorInsertExtractBaseCost = 2; uint16_t CacheLineSize = 0; @@ -252,9 +251,6 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { unsigned getEpilogueVectorizationMinVF() const { return EpilogueVectorizationMinVF; } - bool shouldMaximizeScalableVectorBandwidth() const { - return ShouldMaximizeScalableVectorBandwidth; - } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } unsigned getVectorInsertExtractBaseCost() const; unsigned getCacheLineSize() const override { return CacheLineSize; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 3d2a57806ad13..edd61a2db705e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -381,7 
+381,7 @@ bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( return K == TargetTransformInfo::RGK_ScalableVector && ST->isSVEorStreamingSVEAvailable() && - ST->shouldMaximizeScalableVectorBandwidth(); + !ST->disableMaximizeScalableBandwidth(); } /// Calculate the cost of materializing a 64-bit value. This helper From a114020596e06fe3d8663b31126c6efc0a3f13d5 Mon Sep 17 00:00:00 2001 From: Samuel Tebbs Date: Wed, 26 Nov 2025 15:38:10 +0000 Subject: [PATCH 4/4] Rebase --- .../AArch64/conditional-branches-cost.ll | 58 ++++--------------- .../AArch64/partial-reduce-dot-product.ll | 40 ++++++------- .../LoopVectorize/AArch64/partial-reduce.ll | 14 ++--- .../LoopVectorize/AArch64/store-costs-sve.ll | 44 ++++++++------ 4 files changed, 64 insertions(+), 92 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 4f4f07e68ccfd..118f165050602 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -522,25 +522,23 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-LABEL: define void @multiple_exit_conditions( ; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] { ; DEFAULT-NEXT: [[ENTRY:.*:]] -; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP6:%.*]] = shl nuw i64 [[TMP0]], 1 -; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP6]] -; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]] -; DEFAULT: [[VECTOR_MAIN_LOOP_ITER_CHECK]]: ; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 4 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 257, [[TMP3]] -; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] ; DEFAULT: [[VECTOR_PH]]: ; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 16 ; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP5]] ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]] +; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[N_VEC]], 8 +; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 2 ; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]] ; DEFAULT: [[VECTOR_BODY]]: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] -; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 -; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]] +; DEFAULT-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX]], 8 +; DEFAULT-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX1]] ; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2 ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[TMP1]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i16> poison, <vscale x 4 x i32> zeroinitializer @@ -548,14 +546,14 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-NEXT: [[TMP9:%.*]] = uitofp <vscale x 4 x i16> [[TMP8]] to <vscale x 4 x double> ; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP11:%.*]] = shl nuw i64
[[TMP10]], 2 -; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP11]] +; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[NEXT_GEP1]], i64 [[TMP11]] ; DEFAULT-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 3 -; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP14]] +; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP1]], i64 [[TMP14]] ; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 12 -; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP17]] -; DEFAULT-NEXT: store <vscale x 4 x double> [[TMP9]], ptr [[NEXT_GEP]], align 8 +; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[NEXT_GEP1]], i64 [[TMP17]] +; DEFAULT-NEXT: store <vscale x 4 x double> [[TMP9]], ptr [[NEXT_GEP1]], align 8 ; DEFAULT-NEXT: store <vscale x 4 x double> [[TMP9]], ptr [[TMP12]], align 8 ; DEFAULT-NEXT: store <vscale x 4 x double> [[TMP9]], ptr [[TMP15]], align 8 ; DEFAULT-NEXT: store <vscale x 4 x double> [[TMP9]], ptr [[TMP18]], align 8 @@ -564,40 +562,8 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; DEFAULT: [[MIDDLE_BLOCK]]: ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]] -; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]] -; DEFAULT: [[VEC_EPILOG_ITER_CHECK]]: -; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[N_VEC]], 8 -; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP20]] -; DEFAULT-NEXT: [[IND_END11:%.*]] = mul i64 [[N_VEC]], 2 -; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], [[TMP6]] -; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF24:![0-9]+]] -; DEFAULT: [[VEC_EPILOG_PH]]: -; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; DEFAULT-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP22:%.*]] = mul nuw i64 [[TMP21]], 2 -; DEFAULT-NEXT: [[N_MOD_VF2:%.*]] = urem i64 257, [[TMP22]] -; DEFAULT-NEXT: [[N_VEC3:%.*]] = sub i64 257, [[N_MOD_VF2]] -; DEFAULT-NEXT: [[TMP23:%.*]] = mul i64 [[N_VEC3]], 8 -; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP23]] -; DEFAULT-NEXT: [[TMP25:%.*]] = mul i64 [[N_VEC3]], 2 -; DEFAULT-NEXT: br label %[[VEC_EPILOG_VECTOR_BODY:.*]] -; DEFAULT: [[VEC_EPILOG_VECTOR_BODY]]: -; DEFAULT-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], %[[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], %[[VEC_EPILOG_VECTOR_BODY]] ] -; DEFAULT-NEXT: [[OFFSET_IDX1:%.*]] = mul i64 [[INDEX4]], 8 -; DEFAULT-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX1]] -; DEFAULT-NEXT: [[TMP26:%.*]] = load i16, ptr [[SRC]], align 2 -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP26]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT6]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP27:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT7]], splat (i16 1) -; DEFAULT-NEXT: [[TMP28:%.*]] = uitofp <vscale x 2 x i16> [[TMP27]] to <vscale x 2 x double> -; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP28]], ptr [[NEXT_GEP5]], align 8 -; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX4]], [[TMP22]] -; DEFAULT-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC3]] -; DEFAULT-NEXT: br i1 [[TMP29]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]],
label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] -; DEFAULT: [[VEC_EPILOG_MIDDLE_BLOCK]]: -; DEFAULT-NEXT: [[CMP_N9:%.*]] = icmp eq i64 257, [[N_VEC3]] -; DEFAULT-NEXT: br i1 [[CMP_N9]], [[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]] -; DEFAULT: [[VEC_EPILOG_SCALAR_PH]]: +; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]] +; DEFAULT: [[SCALAR_PH]]: ; ; PRED-LABEL: define void @multiple_exit_conditions( ; PRED-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] { diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll index 5380307215d3f..2bea0733f65b0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-dot-product.ll @@ -174,16 +174,16 @@ define i64 @not_dotp_i8_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly %b ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE14:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48 +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 48 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 32 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i32 48 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 32 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[NEXT_GEP1]], i64 48 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[NEXT_GEP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i8>, ptr [[TMP18]], align 1 @@ -311,16 +311,16 @@ define i64 @not_dotp_i16_to_i64_has_neon_dotprod(ptr readonly %a, ptr readonly % ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 [[OFFSET_IDX]] ; CHECK-INTERLEAVED-NEXT: [[OFFSET_IDX2:%.*]] = mul i64 [[INDEX]], 2 ; CHECK-INTERLEAVED-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 [[OFFSET_IDX2]] -; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 8 -; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i32 
24 +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 8 +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 24 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i16>, ptr [[TMP0]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x i16>, ptr [[TMP10]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP11]], align 2 -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 8 -; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i32 24 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 8 +; CHECK-INTERLEAVED-NEXT: [[TMP18:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[TMP19:%.*]] = getelementptr i16, ptr [[NEXT_GEP3]], i64 24 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x i16>, ptr [[NEXT_GEP3]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD11:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2 @@ -537,7 +537,7 @@ define i32 @not_dotp_different_types(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[TMP30:%.*]] = add i64 [[INDEX]], 30 ; CHECK-INTERLEAVED-NEXT: [[TMP31:%.*]] = add i64 [[INDEX]], 31 ; CHECK-INTERLEAVED-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[A]], i64 [[TMP0]] -; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[TMP32]], i64 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP32]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP34]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP35:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> @@ -801,13 +801,13 @@ define i32 @not_dotp_not_loop_carried(ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED: vector.body: ; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP10]], i64 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD1:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD1]] to <16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP17]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[TMP17]], i64 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP17]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD3:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <16 x i8> [[WIDE_LOAD2]] to <16 x i32> @@ -1330,10 +1330,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVE1-NEXT: [[VEC_PHI:%.*]] = phi 
<vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVE1-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) -; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVE1-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; CHECK-INTERLEAVE1-NEXT: [[TMP10:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32> +; CHECK-INTERLEAVE1-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-INTERLEAVE1-NEXT: [[TMP15:%.*]] = mul nsw <vscale x 16 x i32> [[TMP10]], [[TMP13]] ; CHECK-INTERLEAVE1-NEXT: [[TMP12:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP15]], <vscale x 16 x i32> zeroinitializer ; CHECK-INTERLEAVE1-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP12]]) @@ -1368,10 +1368,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP19:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) -; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-INTERLEAVED-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-INTERLEAVED-NEXT: [[TMP15:%.*]] = mul nsw <vscale x 16 x i32> [[TMP10]], [[TMP13]] ; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP15]], <vscale x 16 x i32> zeroinitializer ; CHECK-INTERLEAVED-NEXT: [[TMP19]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP12]]) @@ -1406,10 +1406,10 @@ define i32 @dotp_predicated(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-MAXBW-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP11]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) -; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-MAXBW-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 [[INDEX]] ; CHECK-MAXBW-NEXT: [[WIDE_MASKED_LOAD1:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr align 1 [[TMP14]], <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison) ; CHECK-MAXBW-NEXT: [[TMP16:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD1]] to <vscale x 16 x i32> +; CHECK-MAXBW-NEXT: [[TMP13:%.*]] = sext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32> ; CHECK-MAXBW-NEXT: [[TMP17:%.*]] = mul nsw <vscale x 16 x i32> [[TMP16]], [[TMP13]] ; CHECK-MAXBW-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i32> [[TMP17]], <vscale x 16 x i32> zeroinitializer ; CHECK-MAXBW-NEXT: [[PARTIAL_REDUCE]] = call <vscale x 4 x i32> @llvm.vector.partial.reduce.add.nxv4i32.nxv16i32(<vscale x 4 x i32> [[VEC_PHI]], <vscale x 16 x i32> [[TMP18]]) @@ -1953,7 +1953,7 @@ define i64 @not_dotp_ext_outside_plan(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ;
CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 8 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> @@ -2082,7 +2082,7 @@ define i64 @not_dotp_ext_outside_plan2(ptr %a, i16 %b, i64 %n) #0 { ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i16, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i32 8 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds nuw i16, ptr [[TMP1]], i64 8 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[TMP1]], align 2 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i16>, ptr [[TMP3]], align 2 ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = zext <8 x i16> [[WIDE_LOAD]] to <8 x i64> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll index 6e6137957f2f5..3d2832eb366ad 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce.ll @@ -146,7 +146,7 @@ define i32 @zext_add_reduc_i8_i32_neon(ptr %a) #2 { ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE3:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 16 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP0]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 ; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32> @@ -449,9 +449,9 @@ define i32 @zext_add_reduc_i8_i32_has_neon_dotprod(ptr %a) #1 { ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i32 16 -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i32 32 -; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP1]], i32 48 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 16 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr 
[[TMP1]], i64 32 +; CHECK-INTERLEAVED-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP1]], i64 48 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1 @@ -1240,9 +1240,9 @@ define i64 @sext_reduction_i32_to_i64(ptr %arr, i64 %n) #1 { ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI2:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE8:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[VEC_PHI3:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE9:%.*]], [[VECTOR_BODY]] ] ; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR]], i64 [[INDEX]] -; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 -; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 -; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 4 +; CHECK-INTERLEAVED-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 8 +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 12 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index 48297937ae79a..365ac6d27fcb0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -10,7 +10,7 @@ define void @cost_store_i8(ptr %dst) #0 { ; DEFAULT-SAME: ptr [[DST:%.*]]) #[[ATTR0:[0-9]+]] { ; DEFAULT-NEXT: iter.check: ; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP10]], 3 +; DEFAULT-NEXT: [[TMP13:%.*]] = shl nuw i64 [[TMP10]], 2 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 101, [[TMP13]] ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; DEFAULT: vector.main.loop.iter.check: @@ -44,14 +44,14 @@ define void @cost_store_i8(ptr %dst) #0 { ; DEFAULT: vec.epilog.ph: ; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; DEFAULT-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 8 +; DEFAULT-NEXT: [[TMP12:%.*]] = mul nuw i64 [[TMP11]], 4 ; DEFAULT-NEXT: [[N_MOD_VF2:%.*]] = urem i64 101, [[TMP12]] ; DEFAULT-NEXT: [[N_VEC3:%.*]] = sub i64 101, [[N_MOD_VF2]] ; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; DEFAULT: vec.epilog.vector.body: ; DEFAULT-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX1]] -; DEFAULT-NEXT: store <vscale x 8 x i8> zeroinitializer, ptr [[TMP9]], align 1 +; DEFAULT-NEXT: store <vscale x 4 x i8> zeroinitializer, ptr [[TMP9]], align 1 ; DEFAULT-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP12]] ; DEFAULT-NEXT: [[TMP14:%.*]] = icmp eq i64
[[INDEX_NEXT2]], [[N_VEC3]] ; DEFAULT-NEXT: br i1 [[TMP14]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -139,15 +139,15 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 1000, [[N_MOD_VF]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i16> poison, i16 [[X]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i16> poison, <vscale x 16 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP13:%.*]] = trunc <vscale x 16 x i16> [[BROADCAST_SPLAT]] to <vscale x 16 x i8> -; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] -; DEFAULT: vector.body: -; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP5:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META6:![0-9]+]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement <vscale x 16 x i64> poison, i64 [[TMP5]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector <vscale x 16 x i64> [[BROADCAST_SPLATINSERT2]], <vscale x 16 x i64> poison, <vscale x 16 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP6:%.*]] = trunc <vscale x 16 x i64> [[BROADCAST_SPLAT3]] to <vscale x 16 x i8> +; DEFAULT-NEXT: [[TMP13:%.*]] = trunc <vscale x 16 x i16> [[BROADCAST_SPLAT]] to <vscale x 16 x i8> ; DEFAULT-NEXT: [[TMP14:%.*]] = and <vscale x 16 x i8> [[TMP6]], [[TMP13]] +; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] +; DEFAULT: vector.body: +; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]] ; DEFAULT-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; DEFAULT-NEXT: [[TMP16:%.*]] = shl nuw i64 [[TMP15]], 4 @@ -162,25 +162,25 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; DEFAULT: vec.epilog.iter.check: ; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_MOD_VF]], 8 -; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF3]] +; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]], !prof [[PROF12:![0-9]+]] ; DEFAULT: vec.epilog.ph: ; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement <8 x i16> poison, i16 [[X]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT4]], <8 x i16> poison, <8 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <8 x i16> [[BROADCAST_SPLAT5]] to <8 x i8> -; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; DEFAULT: vec.epilog.vector.body: -; DEFAULT-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; DEFAULT-NEXT: [[TMP8:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META12:![0-9]+]] +; DEFAULT-NEXT: [[TMP8:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META13:![0-9]+]] ; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <8 x i64> poison, i64 [[TMP8]], i64 0 ; DEFAULT-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <8 x i64> [[BROADCAST_SPLATINSERT7]], <8 x i64> poison, <8 x i32> zeroinitializer ; DEFAULT-NEXT: [[TMP9:%.*]] = trunc <8 x i64> [[BROADCAST_SPLAT8]] to <8 x i8> +; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <8 x i16> [[BROADCAST_SPLAT5]] to <8 x i8> ; DEFAULT-NEXT: [[TMP10:%.*]] = and <8 x i8> [[TMP9]], [[TMP7]] +; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; DEFAULT:
vec.epilog.vector.body: +; DEFAULT-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX6]] -; DEFAULT-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP11]], align 1, !alias.scope [[META15:![0-9]+]], !noalias [[META12]] +; DEFAULT-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP11]], align 1, !alias.scope [[META16:![0-9]+]], !noalias [[META13]] ; DEFAULT-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX6]], 8 ; DEFAULT-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT9]], 1000 -; DEFAULT-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; DEFAULT-NEXT: br i1 [[TMP12]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; DEFAULT: vec.epilog.middle.block: ; DEFAULT-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; DEFAULT: vec.epilog.scalar.ph: @@ -196,7 +196,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: store i8 [[TRUNC]], ptr [[GEP]], align 1 ; DEFAULT-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; DEFAULT-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1000 -; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP18:![0-9]+]] +; DEFAULT-NEXT: br i1 [[EC]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP19:![0-9]+]] ; DEFAULT: exit: ; DEFAULT-NEXT: ret void ; @@ -279,7 +279,7 @@ attributes #1 = { vscale_range(1,16) "target-features"="+sve" } ; DEFAULT: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; DEFAULT: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} ; DEFAULT: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"} -; DEFAULT: [[PROF3]] = !{!"branch_weights", i32 8, i32 24} +; DEFAULT: [[PROF3]] = !{!"branch_weights", i32 4, i32 28} ; DEFAULT: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; DEFAULT: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} ; DEFAULT: [[META6]] = !{[[META7:![0-9]+]]} @@ -288,8 +288,14 @@ attributes #1 = { vscale_range(1,16) "target-features"="+sve" } ; DEFAULT: [[META9]] = !{[[META10:![0-9]+]]} ; DEFAULT: [[META10]] = distinct !{[[META10]], [[META8]]} ; DEFAULT: [[LOOP11]] = distinct !{[[LOOP11]], [[META1]], [[META2]]} -; DEFAULT: [[LOOP17]] = distinct !{[[LOOP17]], [[META1]], [[META2]]} -; DEFAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]]} +; DEFAULT: [[PROF12]] = !{!"branch_weights", i32 8, i32 24} +; DEFAULT: [[META13]] = !{[[META14:![0-9]+]]} +; DEFAULT: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]} +; DEFAULT: [[META15]] = distinct !{[[META15]], !"LVerDomain"} +; DEFAULT: [[META16]] = !{[[META17:![0-9]+]]} +; DEFAULT: [[META17]] = distinct !{[[META17]], [[META15]]} +; DEFAULT: [[LOOP18]] = distinct !{[[LOOP18]], [[META1]], [[META2]]} +; DEFAULT: [[LOOP19]] = distinct !{[[LOOP19]], [[META1]]} ;. ; PRED: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; PRED: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}