diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 744faef192438..ada2ad96bb9bf 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3503,10 +3503,10 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
   if (hasIrregularType(ScalarTy, DL))
     return false;
 
-  // We currently only know how to emit interleave/deinterleave with
-  // Factor=2 for scalable vectors. This is purely an implementation
-  // limit.
-  if (VF.isScalable() && InterleaveFactor != 2)
+  // For scalable vectors, the only interleave factors currently supported
+  // are powers of 2, since we require the (de)interleave2 intrinsics
+  // instead of shufflevectors.
+  if (VF.isScalable() && !isPowerOf2_32(InterleaveFactor))
     return false;
 
   // If the group involves a non-integral pointer, we may not be able to
@@ -9410,9 +9410,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
              CM.getWideningDecision(IG->getInsertPos(), VF) ==
                  LoopVectorizationCostModel::CM_Interleave);
       // For scalable vectors, the only interleave factor currently supported
-      // is 2 since we require the (de)interleave2 intrinsics instead of
-      // shufflevectors.
-      assert((!Result || !VF.isScalable() || IG->getFactor() == 2) &&
+      // must be a power of 2, since we require the (de)interleave2 intrinsics
+      // instead of shufflevectors.
+      assert((!Result || !VF.isScalable() || isPowerOf2_32(IG->getFactor())) &&
              "Unsupported interleave factor for scalable vectors");
       return Result;
     };
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 4057a51155ece..73d606d0f8303 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -2860,10 +2860,21 @@ static Value *interleaveVectors(IRBuilderBase &Builder, ArrayRef<Value *> Vals,
   // Scalable vectors cannot use arbitrary shufflevectors (only splats), so
   // must use intrinsics to interleave.
   if (VecTy->isScalableTy()) {
-    VectorType *WideVecTy = VectorType::getDoubleElementsVectorType(VecTy);
-    return Builder.CreateIntrinsic(WideVecTy, Intrinsic::vector_interleave2,
-                                   Vals,
-                                   /*FMFSource=*/nullptr, Name);
+    assert(isPowerOf2_32(Factor) && "Unsupported interleave factor for "
+                                    "scalable vectors, must be a power of 2");
+    SmallVector<Value *> InterleavingValues(Vals);
+    // When interleaving, the number of live values is halved each round until
+    // only the single, fully interleaved value remains.
+    auto *InterleaveTy = cast<VectorType>(InterleavingValues[0]->getType());
+    for (unsigned Midpoint = Factor / 2; Midpoint > 0; Midpoint /= 2) {
+      InterleaveTy = VectorType::getDoubleElementsVectorType(InterleaveTy);
+      for (unsigned I = 0; I < Midpoint; ++I)
+        InterleavingValues[I] = Builder.CreateIntrinsic(
+            InterleaveTy, Intrinsic::vector_interleave2,
+            {InterleavingValues[I], InterleavingValues[Midpoint + I]},
+            /*FMFSource=*/nullptr, Name);
+    }
+    return InterleavingValues[0];
   }
 
   // Fixed length. Start by concatenating all vectors into a wide vector.
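For illustration, with Factor = 4 and <vscale x 4 x i32> members, the recursive lowering above pairs values that are Midpoint apart and halves Midpoint each round, building a two-level tree of factor-2 interleaves. A minimal IR sketch of what an interleaved store group then produces (value names are illustrative, not taken from the patch):

  %lo   = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %v0, <vscale x 4 x i32> %v2)
  %hi   = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> %v1, <vscale x 4 x i32> %v3)
  %wide = call <vscale x 16 x i32> @llvm.vector.interleave2.nxv16i32(<vscale x 8 x i32> %lo, <vscale x 8 x i32> %hi)

The first round combines members (0, 2) and (1, 3); the second round merges the two halves, yielding lanes in the order v0[0], v1[0], v2[0], v3[0], v0[1], ... as a factor-4 group requires. The AArch64 tests added below check exactly this pattern.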
@@ -2949,15 +2960,11 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
                 &InterleaveFactor](Value *MaskForGaps) -> Value * {
     if (State.VF.isScalable()) {
       assert(!MaskForGaps && "Interleaved groups with gaps are not supported.");
-      assert(InterleaveFactor == 2 &&
+      assert(isPowerOf2_32(InterleaveFactor) &&
             "Unsupported deinterleave factor for scalable vectors");
       auto *ResBlockInMask = State.get(BlockInMask);
-      SmallVector<Value *> Ops = {ResBlockInMask, ResBlockInMask};
-      auto *MaskTy = VectorType::get(State.Builder.getInt1Ty(),
-                                     State.VF.getKnownMinValue() * 2, true);
-      return State.Builder.CreateIntrinsic(
-          MaskTy, Intrinsic::vector_interleave2, Ops,
-          /*FMFSource=*/nullptr, "interleaved.mask");
+      SmallVector<Value *> Ops(InterleaveFactor, ResBlockInMask);
+      return interleaveVectors(State.Builder, Ops, "interleaved.mask");
     }
 
     if (!BlockInMask)
@@ -2997,22 +3004,48 @@ void VPInterleaveRecipe::execute(VPTransformState &State) {
     ArrayRef<VPValue *> VPDefs = definedValues();
     const DataLayout &DL = State.CFG.PrevBB->getDataLayout();
     if (VecTy->isScalableTy()) {
-      assert(InterleaveFactor == 2 &&
+      assert(isPowerOf2_32(InterleaveFactor) &&
             "Unsupported deinterleave factor for scalable vectors");
 
-      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
-      // so must use intrinsics to deinterleave.
-      Value *DI = State.Builder.CreateIntrinsic(
-          Intrinsic::vector_deinterleave2, VecTy, NewLoad,
-          /*FMFSource=*/nullptr, "strided.vec");
-      unsigned J = 0;
-      for (unsigned I = 0; I < InterleaveFactor; ++I) {
-        Instruction *Member = Group->getMember(I);
+      // Scalable vectors cannot use arbitrary shufflevectors (only splats),
+      // so must use intrinsics to deinterleave.
+      SmallVector<Value *> DeinterleavedValues(InterleaveFactor);
+      DeinterleavedValues[0] = NewLoad;
+      // For InterleaveFactor > 2 we have to deinterleave recursively, because
+      // the only available deinterleave intrinsic supports a factor of 2; for
+      // InterleaveFactor == 2 the loop below finishes after one iteration.
+      // When deinterleaving, the number of values doubles each round until we
+      // have InterleaveFactor of them.
+      for (unsigned NumVectors = 1; NumVectors < InterleaveFactor;
+           NumVectors *= 2) {
+        // Deinterleave the elements within each vector.
+        SmallVector<Value *> TempDeinterleavedValues(NumVectors);
+        for (unsigned I = 0; I < NumVectors; ++I) {
+          auto *DiTy = DeinterleavedValues[I]->getType();
+          TempDeinterleavedValues[I] = State.Builder.CreateIntrinsic(
+              Intrinsic::vector_deinterleave2, DiTy, DeinterleavedValues[I],
+              /*FMFSource=*/nullptr, "strided.vec");
+        }
+        // Extract the deinterleaved values:
+        for (unsigned I = 0; I < 2; ++I)
+          for (unsigned J = 0; J < NumVectors; ++J)
+            DeinterleavedValues[NumVectors * I + J] =
+                State.Builder.CreateExtractValue(TempDeinterleavedValues[J], I);
+      }
 
-        if (!Member)
+#ifndef NDEBUG
+      for (Value *Val : DeinterleavedValues)
+        assert(Val && "NULL Deinterleaved Value");
+#endif
+      for (unsigned I = 0, J = 0; I < InterleaveFactor; ++I) {
+        Instruction *Member = Group->getMember(I);
+        Value *StridedVec = DeinterleavedValues[I];
+        if (!Member) {
+          // This member is unused, so erase the dead deinterleaved value.
+          cast<Instruction>(StridedVec)->eraseFromParent();
           continue;
-
-        Value *StridedVec = State.Builder.CreateExtractValue(DI, I);
+        }
 
         // If this member has different type, cast the result type.
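The load path is the mirror image: starting from the wide load, each round doubles the number of values by splitting lanes with vector_deinterleave2 until InterleaveFactor values remain. For a factor-4 group of <vscale x 4 x i32> members, a sketch of the emitted IR (illustrative value names):

  %di   = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide.vec)
  %even = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %di, 0    ; members 0 and 2
  %odd  = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %di, 1    ; members 1 and 3
  %di.e = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %even)
  %di.o = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %odd)
  %v0   = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %di.e, 0
  %v1   = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %di.o, 0
  %v2   = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %di.e, 1
  %v3   = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %di.o, 1

Note the DeinterleavedValues[NumVectors * I + J] indexing: member 1 comes from the odd tree and member 2 from the even tree, matching the extractvalue order checked in the regenerated tests below.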
if (Member->getType() != ScalarTy) { VectorType *OtherVTy = VectorType::get(Member->getType(), State.VF); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index bf95622733461..05c0bc0761ea4 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -396,8 +396,8 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP10]]) ; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP11]]) ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] @@ -1548,5 +1548,263 @@ end: ret void } +; Check vectorization on an interleaved load/store groups of factor 4 + +; for (int i = 0; i < 1024; ++i) { +; dst[i].x = a[i].x + b[i].x; +; dst[i].y = a[i].y - b[i].y; +; dst[i].z = a[i].z << b[i].z; +; dst[i].t = a[i].t >> b[i].t; +; } +%struct.xyzt = type { i32, i32, i32, i32 } + +define void @interleave_deinterleave(ptr writeonly noalias %dst, ptr readonly %a, ptr readonly %b) { +; CHECK-LABEL: @interleave_deinterleave( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i64 [[TMP1]], 1024 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[TMP4]], 2 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP7]]) +; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 0 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC7]], 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_VEC8:%.*]] = load , ptr [[TMP13]], align 4 +; 
CHECK-NEXT: [[STRIDED_VEC9:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC8]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC9]], 1 +; CHECK-NEXT: [[STRIDED_VEC10:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP14]]) +; CHECK-NEXT: [[STRIDED_VEC11:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP15]]) +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC10]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC11]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP16]], [[TMP9]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP22:%.*]] = sub nsw [[TMP10]], [[TMP17]] +; CHECK-NEXT: [[TMP23:%.*]] = shl [[TMP11]], [[TMP18]] +; CHECK-NEXT: [[TMP24:%.*]] = ashr [[TMP12]], [[TMP19]] +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP20]], [[TMP23]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP22]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC13:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC13]], ptr [[TMP21]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP41:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP27]], [[TMP26]] +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds nuw [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4 +; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[Y]], align 4 +; CHECK-NEXT: [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4 +; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[Y11]], align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP28]], [[TMP29]] +; CHECK-NEXT: [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4 +; CHECK-NEXT: store i32 [[SUB]], ptr [[Y14]], align 4 +; CHECK-NEXT: [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8 +; CHECK-NEXT: [[TMP30:%.*]] = load i32, ptr [[Z]], align 4 +; CHECK-NEXT: [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8 +; CHECK-NEXT: [[TMP31:%.*]] = load i32, ptr [[Z19]], align 4 +; CHECK-NEXT: 
[[SHL:%.*]] = shl i32 [[TMP30]], [[TMP31]] +; CHECK-NEXT: [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8 +; CHECK-NEXT: store i32 [[SHL]], ptr [[Z22]], align 4 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12 +; CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr [[T]], align 4 +; CHECK-NEXT: [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12 +; CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[T27]], align 4 +; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[TMP32]], [[TMP33]] +; CHECK-NEXT: [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12 +; CHECK-NEXT: store i32 [[SHR]], ptr [[T30]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP42:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds %struct.xyzt, ptr %a, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds %struct.xyzt, ptr %b, i64 %indvars.iv + %1 = load i32, ptr %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx5 = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %indvars.iv + store i32 %add, ptr %arrayidx5, align 4 + %y = getelementptr inbounds nuw i8, ptr %arrayidx, i64 4 + %2 = load i32, ptr %y, align 4 + %y11 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 4 + %3 = load i32, ptr %y11, align 4 + %sub = sub nsw i32 %2, %3 + %y14 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 4 + store i32 %sub, ptr %y14, align 4 + %z = getelementptr inbounds nuw i8, ptr %arrayidx, i64 8 + %4 = load i32, ptr %z, align 4 + %z19 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 8 + %5 = load i32, ptr %z19, align 4 + %shl = shl i32 %4, %5 + %z22 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 8 + store i32 %shl, ptr %z22, align 4 + %t = getelementptr inbounds nuw i8, ptr %arrayidx, i64 12 + %6 = load i32, ptr %t, align 4 + %t27 = getelementptr inbounds nuw i8, ptr %arrayidx2, i64 12 + %7 = load i32, ptr %t27, align 4 + %shr = ashr i32 %6, %7 + %t30 = getelementptr inbounds nuw i8, ptr %arrayidx5, i64 12 + store i32 %shr, ptr %t30, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 %indvars.iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} + +; Check vectorization on a reverse interleaved load/store groups of factor 4 + +; for (int i = 1023; i >= 0; i--) { +; int a = A[i].x + i; +; int b = A[i].y - i; +; int c = A[i].z * i; +; int d = A[i].t << i; +; B[i].x = a; +; B[i].y = b; +; B[i].z = c; +; B[i].t = d; +; } + +define void @interleave_deinterleave_reverse(ptr noalias nocapture readonly %A, ptr noalias nocapture %B) #1{ +; CHECK-LABEL: @interleave_deinterleave_reverse( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub splat (i32 1023), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = trunc nuw nsw i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP4:%.*]] = sub nsw i32 0, [[TMP3]] +; CHECK-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP4]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i32 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = sub nsw i32 4, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv16i32( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP11]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[TMP12]]) +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP13]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[REVERSE4:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP15]]) +; CHECK-NEXT: [[REVERSE5:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[TMP17:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP18:%.*]] = sub nsw [[REVERSE3]], [[VEC_IND]] +; CHECK-NEXT: [[TMP19:%.*]] = mul nsw [[REVERSE4]], [[VEC_IND]] +; CHECK-NEXT: [[TMP20:%.*]] = shl nuw nsw [[REVERSE5]], [[VEC_IND]] +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP22:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i32 [[TMP22]], 4 +; CHECK-NEXT: [[TMP24:%.*]] = sub nsw i32 4, [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = sext i32 [[TMP24]] to i64 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP25]] +; CHECK-NEXT: [[REVERSE6:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP17]]) +; CHECK-NEXT: [[REVERSE7:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP18]]) +; CHECK-NEXT: [[REVERSE8:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP19]]) +; CHECK-NEXT: [[REVERSE9:%.*]] = call @llvm.vector.reverse.nxv4i32( [[TMP20]]) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE6]], [[REVERSE8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[REVERSE7]], [[REVERSE9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv16i32( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC10]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC11]], ptr [[TMP26]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[VEC_IND_NEXT]] = add 
[[VEC_IND]], [[DOTSPLAT]] +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP43:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret void +; CHECK: for.body: +; CHECK-NEXT: br i1 poison, label [[FOR_BODY]], label [[FOR_COND_CLEANUP]], !llvm.loop [[LOOP44:![0-9]+]] +; +entry: + br label %for.body +for.cond.cleanup: ; preds = %for.body + ret void +for.body: ; preds = %for.body, %entry + %indvars.iv = phi i64 [ 1023, %entry ], [ %indvars.iv.next, %for.body ] + %x = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 0 + %load1 = load i32, ptr %x, align 4 + %trunc = trunc i64 %indvars.iv to i32 + %add = add nsw i32 %load1, %trunc + %y = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 1 + %load2 = load i32, ptr %y, align 4 + %sub = sub nsw i32 %load2, %trunc + %z = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 2 + %load3 = load i32, ptr %z, align 4 + %mul = mul nsw i32 %load3, %trunc + %t = getelementptr inbounds %struct.xyzt, ptr %A, i64 %indvars.iv, i32 3 + %load4 = load i32, ptr %t, align 4 + %shl = shl nuw nsw i32 %load4, %trunc + %x5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 0 + store i32 %add, ptr %x5, align 4 + %y8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 1 + store i32 %sub, ptr %y8, align 4 + %z5 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 2 + store i32 %mul, ptr %z5, align 4 + %t8 = getelementptr inbounds %struct.xyzt, ptr %B, i64 %indvars.iv, i32 3 + store i32 %shl, ptr %t8, align 4 + %indvars.iv.next = add nsw i64 %indvars.iv, -1 + %cmp = icmp sgt i64 %indvars.iv, 0 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +} attributes #1 = { "target-features"="+sve" vscale_range(1, 16) } attributes #0 = { "unsafe-fp-math"="true" "target-features"="+sve" vscale_range(1, 16) } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll index 1a281fe7c6f7f..d4392bebdf37b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll @@ -529,3 +529,255 @@ for.inc: for.end: ret void } + +; Expected to contain interleave2/deinterleave2 instructions +; +; void masked_strided_factor4(const unsigned char* restrict p, +; unsigned char* restrict q, +; unsigned char guard) { +; for(ix=0; ix < 1024; ++ix) { +; if (ix > guard) { +; char left1 = p[4*ix]; +; char right1 = p[4*ix + 1]; +; char left2 = p[4*ix + 2]; +; char right2 = p[4*ix + 3]; +; char max1 = max(left1, right1); +; char max2 = max(left2, right2); +; q[4*ix] = max1; +; q[4*ix + 1] = 0 - max1; +; q[4*ix + 2] = max2; +; q[4*ix + 3] = 0 - max2; +; } +; } +;} +define dso_local void @masked_strided_factor4(ptr noalias nocapture readonly %p, ptr noalias nocapture %q, i8 zeroext %guard) local_unnamed_addr #0 { +; SCALAR_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; SCALAR_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; SCALAR_TAIL_FOLDING-NEXT: entry: +; SCALAR_TAIL_FOLDING-NEXT: 
[[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ugt i32 [[TMP1]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALAR_TAIL_FOLDING: vector.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] +; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: vector.body: +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call 
@llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALAR_TAIL_FOLDING: middle.block: +; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] +; SCALAR_TAIL_FOLDING: scalar.ph: +; SCALAR_TAIL_FOLDING-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; SCALAR_TAIL_FOLDING: for.body: +; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: if.then: +; SCALAR_TAIL_FOLDING-NEXT: [[IDX0:%.*]] = shl nuw nsw i32 [[IX_024]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX1:%.*]] = or disjoint i32 [[IDX0]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX2:%.*]] = or disjoint i32 [[IDX0]], 2 +; SCALAR_TAIL_FOLDING-NEXT: [[IDX3:%.*]] = or disjoint i32 [[IDX0]], 3 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP24]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr [[ARRAY1IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = load i8, ptr [[ARRAY1IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP28:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP28]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAY1IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP30:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY1IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 [[TMP30]] 
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP31:%.*]] = load i8, ptr [[ARRAY1IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I1:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP25]], i8 [[TMP27]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB1:%.*]] = sub i8 0, [[SPEC_SELECT_I1]] +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I2:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP29]], i8 [[TMP31]]) +; SCALAR_TAIL_FOLDING-NEXT: [[SUB2:%.*]] = sub i8 0, [[SPEC_SELECT_I2]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP32:%.*]] = zext nneg i32 [[IDX0]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX0:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP32]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I1]], ptr [[ARRAY3IDX0]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP33:%.*]] = zext nneg i32 [[IDX1]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX1:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP33]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB1]], ptr [[ARRAY3IDX1]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP34:%.*]] = zext nneg i32 [[IDX2]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX2:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP34]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I2]], ptr [[ARRAY3IDX2]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP35:%.*]] = zext nneg i32 [[IDX3]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAY3IDX3:%.*]] = getelementptr inbounds nuw i8, ptr [[Q]], i64 [[TMP35]] +; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB2]], ptr [[ARRAY3IDX3]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; SCALAR_TAIL_FOLDING: for.inc: +; SCALAR_TAIL_FOLDING-NEXT: [[INC]] = add nuw nsw i32 [[IX_024]], 1 +; SCALAR_TAIL_FOLDING-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1024 +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALAR_TAIL_FOLDING: for.end: +; SCALAR_TAIL_FOLDING-NEXT: ret void +; +; PREDICATED_TAIL_FOLDING-LABEL: define dso_local void @masked_strided_factor4 +; PREDICATED_TAIL_FOLDING-SAME: (ptr noalias nocapture readonly [[P:%.*]], ptr noalias nocapture [[Q:%.*]], i8 zeroext [[GUARD:%.*]]) local_unnamed_addr #[[ATTR0]] { +; PREDICATED_TAIL_FOLDING-NEXT: entry: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; PREDICATED_TAIL_FOLDING: vector.ph: +; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: vector.body: +; 
PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 2 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK2:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK]], [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv64i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK2]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv64i8( [[WIDE_MASKED_VEC]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP11]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = call @llvm.smax.nxv16i8( [[TMP13]], [[TMP14]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = sub zeroinitializer, [[TMP17]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call @llvm.smax.nxv16i8( [[TMP15]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = sub zeroinitializer, [[TMP19]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP21]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP17]], [[TMP19]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC5:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP18]], [[TMP20]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC6:%.*]] = call @llvm.vector.interleave2.nxv64i8( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC5]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK7:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK8:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK9:%.*]] = call @llvm.vector.interleave2.nxv64i1( [[INTERLEAVED_MASK7]], [[INTERLEAVED_MASK8]]) +; 
PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv64i8.p0( [[INTERLEAVED_VEC6]], ptr [[TMP22]], i32 1, [[INTERLEAVED_MASK9]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP8:![0-9]+]] +; PREDICATED_TAIL_FOLDING: middle.block: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; PREDICATED_TAIL_FOLDING: scalar.ph: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] +; PREDICATED_TAIL_FOLDING: for.body: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[IF_THEN:%.*]], label [[FOR_INC:%.*]] +; PREDICATED_TAIL_FOLDING: if.then: +; PREDICATED_TAIL_FOLDING-NEXT: br label [[FOR_INC]] +; PREDICATED_TAIL_FOLDING: for.inc: +; PREDICATED_TAIL_FOLDING-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PREDICATED_TAIL_FOLDING: for.end: +; PREDICATED_TAIL_FOLDING-NEXT: ret void +; +entry: + %conv = zext i8 %guard to i32 + br label %for.body + +for.body: + %ix.024 = phi i32 [ 0, %entry ], [ %inc, %for.inc ] + %cmp1 = icmp ugt i32 %ix.024, %conv + br i1 %cmp1, label %if.then, label %for.inc + +if.then: + %idx0 = shl nuw nsw i32 %ix.024, 2 + %idx1 = add i32 %idx0, 1 + %idx2 = add i32 %idx0, 2 + %idx3 = add i32 %idx0, 3 + + %array1idx0 = getelementptr inbounds i8, ptr %p, i32 %idx0 + %0 = load i8, ptr %array1idx0, align 1 + %array1idx1 = getelementptr inbounds i8, ptr %p, i32 %idx1 + %1 = load i8, ptr %array1idx1, align 1 + %array1idx2 = getelementptr inbounds i8, ptr %p, i32 %idx2 + %2 = load i8, ptr %array1idx2, align 1 + %array1idx3 = getelementptr inbounds i8, ptr %p, i32 %idx3 + %3 = load i8, ptr %array1idx3, align 1 + + %cmp.i1 = icmp slt i8 %0, %1 + %spec.select.i1 = select i1 %cmp.i1, i8 %1, i8 %0 + %sub1 = sub i8 0, %spec.select.i1 + %cmp.i2 = icmp slt i8 %2, %3 + %spec.select.i2 = select i1 %cmp.i2, i8 %3, i8 %2 + %sub2 = sub i8 0, %spec.select.i2 + + %array3idx0 = getelementptr inbounds i8, ptr %q, i32 %idx0 + store i8 %spec.select.i1, ptr %array3idx0, align 1 + %array3idx1 = getelementptr inbounds i8, ptr %q, i32 %idx1 + store i8 %sub1, ptr %array3idx1, align 1 + %array3idx2 = getelementptr inbounds i8, ptr %q, i32 %idx2 + store i8 %spec.select.i2, ptr %array3idx2, align 1 + %array3idx3 = getelementptr inbounds i8, ptr %q, i32 %idx3 + store i8 %sub2, ptr %array3idx3, align 1 + + br label %for.inc + +for.inc: + %inc = add nuw nsw i32 %ix.024, 1 + %exitcond = icmp eq i32 %inc, 1024 + br i1 %exitcond, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index bda4839dead51..b1ff589fe51bf 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -9,7 +9,7 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 
[[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -17,88 +17,88 @@ define void @load_store_factor2_i32(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: 
[[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; CHECK-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP7]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP9]], <16 x i32> poison, <16 x i32> -; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> +; FIXED-NEXT: store <16 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label 
[[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; FIXED-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -107,7 +107,7 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 @@ -115,44 +115,44 @@ define void @load_store_factor2_i32(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], 
align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; SCALABLE-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 1) -; SCALABLE-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i32 2) -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP12]], [[TMP15]]) -; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; SCALABLE-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; SCALABLE-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALABLE-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; SCALABLE-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; SCALABLE-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i32 1) +; SCALABLE-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i32 2) +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv8i32( [[TMP11]], [[TMP12]]) +; SCALABLE-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; SCALABLE-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; SCALABLE-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; SCALABLE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; SCALABLE-NEXT: [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q2]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 ; SCALABLE-NEXT: store i32 [[Y1]], ptr [[Q1]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP3:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -186,7 +186,7 @@ 
define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -194,88 +194,88 @@ define void @load_store_factor2_i64(ptr %p) { ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 1) -; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP11]], splat (i64 2) -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP12]], [[TMP15]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP9]], splat (i64 1) +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP10]], splat (i64 2) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[TMP11]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP5]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; 
CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; CHECK-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; CHECK-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor2_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP7]], <8 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP9]], <8 x i64> poison, <8 x i32> -; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x 
i32> +; FIXED-NEXT: store <8 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP6:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 1 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET2:%.*]] = shl i64 [[I1]], 1 +; FIXED-NEXT: [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q2]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 ; FIXED-NEXT: store i64 [[Y1]], ptr [[Q1]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; @@ -284,7 +284,7 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 ; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: ; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 @@ -292,44 +292,44 @@ define void @load_store_factor2_i64(ptr %p) { ; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; SCALABLE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() ; SCALABLE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 -; SCALABLE-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP8]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: 
[[TMP6:%.*]] = add i64 [[I]], 0
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[Q0]], align 8
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP15:%.*]] = add <vscale x 2 x i64> [[TMP11]], splat (i64 2)
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP12]], <vscale x 2 x i64> [[TMP15]])
-; SCALABLE-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[TMP8]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], splat (i64 2)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[TMP11]], <vscale x 2 x i64> [[TMP12]])
+; SCALABLE-NEXT:    store <vscale x 4 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP4:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
+; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
+; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q2]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
 ; SCALABLE-NEXT:    store i64 [[Y1]], ptr [[Q1]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP5:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
+; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP5:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -360,42 +360,42 @@ exit: define void @load_store_factor3_i32(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i32( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; CHECK-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; 
CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; CHECK-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; CHECK-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; CHECK-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -405,50 +405,50 @@ define void @load_store_factor3_i32(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; CHECK-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; CHECK-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i32( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> 
[[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; FIXED-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; FIXED-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; FIXED-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; FIXED-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -458,50 +458,50 @@ define void @load_store_factor3_i32(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; FIXED-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; FIXED-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop 
[[LOOP7:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i32( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[TMP2]], align 4 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <24 x i32>, ptr [[Q0]], align 4 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <24 x i32> [[WIDE_VEC]], <24 x i32> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP6]], <16 x i32> -; SCALABLE-NEXT: [[TMP12:%.*]] = shufflevector <8 x i32> [[TMP9]], <8 x i32> poison, <16 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP11]], <16 x i32> [[TMP12]], <24 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP13]], <24 x i32> poison, <24 x i32> -; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 4 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <8 x i32> [[STRIDED_VEC]], splat (i32 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <8 x i32> [[STRIDED_VEC1]], splat (i32 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <8 x i32> [[STRIDED_VEC2]], splat (i32 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <16 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <16 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP6]], <16 x i32> [[TMP7]], <24 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <24 x i32> [[TMP8]], <24 x i32> poison, <24 x i32> +; SCALABLE-NEXT: store <24 x i32> [[INTERLEAVED_VEC]], ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 8 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, 
[[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q0]], align 4 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i32, ptr [[Q3]], align 4 ; SCALABLE-NEXT: [[Y0:%.*]] = add i32 [[X0]], 1 -; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q0]], align 4 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i32 [[Y0]], ptr [[Q3]], align 4 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i32, ptr [[Q1]], align 4 ; SCALABLE-NEXT: [[Y1:%.*]] = add i32 [[X1]], 2 @@ -511,9 +511,9 @@ define void @load_store_factor3_i32(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i32, ptr [[Q2]], align 4 ; SCALABLE-NEXT: [[Y2:%.*]] = add i32 [[X2]], 3 ; SCALABLE-NEXT: store i32 [[Y2]], ptr [[Q2]], align 4 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP7:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -550,42 +550,42 @@ exit: define void @load_store_factor3_i64(ptr %p) { ; CHECK-LABEL: @load_store_factor3_i64( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP9:%.*]] = add <4 x i64> 
[[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; CHECK-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; CHECK-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -595,50 +595,50 @@ define void @load_store_factor3_i64(ptr %p) { ; CHECK-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; CHECK-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; CHECK-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; 
CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor3_i64( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; FIXED-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; FIXED-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; FIXED-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; FIXED-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; FIXED-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -648,50 +648,50 @@ define void @load_store_factor3_i64(ptr %p) { ; FIXED-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; FIXED-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; FIXED-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor3_i64( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[TMP2]], align 8 +; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[TMP0]], 3 +; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <12 x i64>, ptr [[Q0]], align 8 ; SCALABLE-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> ; SCALABLE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i64> [[WIDE_VEC]], <12 x i64> poison, <4 x i32> -; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) -; SCALABLE-NEXT: [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) -; SCALABLE-NEXT: [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) -; SCALABLE-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP6]], <8 x i32> -; SCALABLE-NEXT: 
[[TMP12:%.*]] = shufflevector <4 x i64> [[TMP9]], <4 x i64> poison, <8 x i32> -; SCALABLE-NEXT: [[TMP13:%.*]] = shufflevector <8 x i64> [[TMP11]], <8 x i64> [[TMP12]], <12 x i32> -; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP13]], <12 x i64> poison, <12 x i32> -; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; SCALABLE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; SCALABLE-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; SCALABLE-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; SCALABLE-NEXT: [[TMP3:%.*]] = add <4 x i64> [[STRIDED_VEC]], splat (i64 1) +; SCALABLE-NEXT: [[TMP4:%.*]] = add <4 x i64> [[STRIDED_VEC1]], splat (i64 2) +; SCALABLE-NEXT: [[TMP5:%.*]] = add <4 x i64> [[STRIDED_VEC2]], splat (i64 3) +; SCALABLE-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP3]], <4 x i64> [[TMP4]], <8 x i32> +; SCALABLE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> +; SCALABLE-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <12 x i32> +; SCALABLE-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <12 x i64> [[TMP8]], <12 x i64> poison, <12 x i32> +; SCALABLE-NEXT: store <12 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[NEXTI]] = add nuw i64 [[I]], 4 +; SCALABLE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; SCALABLE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP8:![0-9]+]] ; SCALABLE: middle.block: ; SCALABLE-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; SCALABLE: scalar.ph: -; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; SCALABLE-NEXT: br label [[LOOP:%.*]] +; SCALABLE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; SCALABLE-NEXT: br label [[LOOP1:%.*]] ; SCALABLE: loop: -; SCALABLE-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; SCALABLE-NEXT: [[OFFSET0:%.*]] = mul i64 [[I]], 3 -; SCALABLE-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; SCALABLE-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; SCALABLE-NEXT: [[OFFSET3:%.*]] = mul i64 [[I1]], 3 +; SCALABLE-NEXT: [[Q3:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET3]] +; SCALABLE-NEXT: [[X0:%.*]] = load i64, ptr [[Q3]], align 8 ; SCALABLE-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; SCALABLE-NEXT: store i64 [[Y0]], ptr [[Q3]], align 8 +; SCALABLE-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET3]], 1 ; SCALABLE-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; SCALABLE-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; SCALABLE-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -701,9 +701,9 @@ define void @load_store_factor3_i64(ptr %p) { ; SCALABLE-NEXT: [[X2:%.*]] = load i64, ptr [[Q2]], align 8 ; SCALABLE-NEXT: [[Y2:%.*]] = add i64 [[X2]], 3 ; SCALABLE-NEXT: store i64 [[Y2]], ptr [[Q2]], align 8 -; SCALABLE-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP9:![0-9]+]] +; SCALABLE-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; SCALABLE-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 
+; SCALABLE-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP9:![0-9]+]] ; SCALABLE: exit: ; SCALABLE-NEXT: ret void ; @@ -740,56 +740,75 @@ exit: define void @load_store_factor8(ptr %p) { ; CHECK-LABEL: @load_store_factor8( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; CHECK-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; CHECK-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; CHECK-NEXT: store <16 x i64> 
[[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; CHECK-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[I]], 0 +; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3 +; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[Q0]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv8i64( [[WIDE_VEC]]) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP6]]) +; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.vector.deinterleave2.nxv4i64( [[TMP7]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC1]], 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP8]]) +; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP9]]) +; CHECK-NEXT: [[STRIDED_VEC5:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP10]]) +; CHECK-NEXT: [[STRIDED_VEC6:%.*]] = call { , } @llvm.vector.deinterleave2.nxv2i64( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 0 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC3]], 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC4]], 1 +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[STRIDED_VEC5]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[STRIDED_VEC6]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = add [[TMP12]], splat (i64 1) +; CHECK-NEXT: [[TMP21:%.*]] = add [[TMP13]], splat (i64 2) +; CHECK-NEXT: [[TMP22:%.*]] = add [[TMP14]], splat (i64 3) +; CHECK-NEXT: [[TMP23:%.*]] = add [[TMP15]], splat (i64 4) +; CHECK-NEXT: [[TMP24:%.*]] = add [[TMP16]], splat (i64 5) +; CHECK-NEXT: [[TMP25:%.*]] = add [[TMP17]], splat (i64 6) +; CHECK-NEXT: [[TMP26:%.*]] = add [[TMP18]], splat (i64 7) +; CHECK-NEXT: [[TMP27:%.*]] = add [[TMP19]], splat (i64 8) +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP20]], [[TMP24]]) +; CHECK-NEXT: [[INTERLEAVED_VEC7:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP21]], [[TMP25]]) +; CHECK-NEXT: [[INTERLEAVED_VEC8:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP22]], [[TMP26]]) +; CHECK-NEXT: [[INTERLEAVED_VEC9:%.*]] = call @llvm.vector.interleave2.nxv2i64( [[TMP23]], [[TMP27]]) +; CHECK-NEXT: [[INTERLEAVED_VEC10:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC]], [[INTERLEAVED_VEC8]]) +; CHECK-NEXT: [[INTERLEAVED_VEC11:%.*]] = call @llvm.vector.interleave2.nxv4i64( [[INTERLEAVED_VEC7]], [[INTERLEAVED_VEC9]]) +; CHECK-NEXT: [[INTERLEAVED_VEC12:%.*]] = call @llvm.vector.interleave2.nxv8i64( [[INTERLEAVED_VEC10]], [[INTERLEAVED_VEC11]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC12]], 
ptr [[Q0]], align 8 +; CHECK-NEXT: [[NEXTI]] = add nuw i64 [[I]], [[TMP2]] +; CHECK-NEXT: [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br label [[LOOP1:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; CHECK-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; CHECK-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; CHECK-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; CHECK-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; CHECK-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; CHECK-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; CHECK-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; CHECK-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; CHECK-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; CHECK-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; CHECK-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; CHECK-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -824,23 +843,23 @@ define void @load_store_factor8(ptr %p) { ; CHECK-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; CHECK-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; CHECK-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; CHECK-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; CHECK-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; CHECK-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; ; FIXED-LABEL: @load_store_factor8( ; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-NEXT: br label [[LOOP:%.*]] ; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; FIXED-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[TMP2]], align 8 +; FIXED-NEXT: [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ] +; FIXED-NEXT: [[TMP0:%.*]] = add i64 [[I]], 0 +; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[TMP0]], 3 +; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]] +; FIXED-NEXT: [[WIDE_VEC:%.*]] = load <16 x i64>, ptr [[Q0]], align 8 ; FIXED-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; 
FIXED-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> @@ -849,39 +868,39 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> ; FIXED-NEXT: [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) -; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) -; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) -; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) -; FIXED-NEXT: [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) -; FIXED-NEXT: [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) -; FIXED-NEXT: [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) -; FIXED-NEXT: [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) -; FIXED-NEXT: [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> -; FIXED-NEXT: [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> -; FIXED-NEXT: [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> -; FIXED-NEXT: [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> -; FIXED-NEXT: [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> -; FIXED-NEXT: [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> -; FIXED-NEXT: [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> -; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> -; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8 -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 -; FIXED-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; FIXED-NEXT: [[TMP3:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1) +; FIXED-NEXT: [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2) +; FIXED-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3) +; FIXED-NEXT: [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4) +; FIXED-NEXT: [[TMP7:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5) +; FIXED-NEXT: [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6) +; FIXED-NEXT: [[TMP9:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7) +; FIXED-NEXT: [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8) +; FIXED-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP4]], <4 x i32> +; FIXED-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> [[TMP6]], <4 x i32> +; FIXED-NEXT: [[TMP13:%.*]] = shufflevector <2 x i64> [[TMP7]], <2 x i64> [[TMP8]], <4 x i32> +; FIXED-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP9]], <2 x i64> [[TMP10]], <4 x i32> +; FIXED-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP11]], <4 x i64> [[TMP12]], <8 x i32> +; FIXED-NEXT: [[TMP16:%.*]] = shufflevector <4 x i64> [[TMP13]], <4 x i64> [[TMP14]], <8 x i32> +; FIXED-NEXT: [[TMP17:%.*]] = shufflevector <8 x i64> [[TMP15]], <8 x i64> 
[[TMP16]], <16 x i32> +; FIXED-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP17]], <16 x i64> poison, <16 x i32> +; FIXED-NEXT: store <16 x i64> [[INTERLEAVED_VEC]], ptr [[Q0]], align 8 +; FIXED-NEXT: [[NEXTI]] = add nuw i64 [[I]], 2 +; FIXED-NEXT: [[TMP18:%.*]] = icmp eq i64 [[NEXTI]], 1024 +; FIXED-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]] ; FIXED: middle.block: ; FIXED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] ; FIXED: scalar.ph: -; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; FIXED-NEXT: br label [[LOOP:%.*]] +; FIXED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; FIXED-NEXT: br label [[LOOP1:%.*]] ; FIXED: loop: -; FIXED-NEXT: [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ] -; FIXED-NEXT: [[OFFSET0:%.*]] = shl i64 [[I]], 3 -; FIXED-NEXT: [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]] -; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q0]], align 8 +; FIXED-NEXT: [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ] +; FIXED-NEXT: [[OFFSET8:%.*]] = shl i64 [[I1]], 3 +; FIXED-NEXT: [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]] +; FIXED-NEXT: [[X0:%.*]] = load i64, ptr [[Q8]], align 8 ; FIXED-NEXT: [[Y0:%.*]] = add i64 [[X0]], 1 -; FIXED-NEXT: store i64 [[Y0]], ptr [[Q0]], align 8 -; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1 +; FIXED-NEXT: store i64 [[Y0]], ptr [[Q8]], align 8 +; FIXED-NEXT: [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1 ; FIXED-NEXT: [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]] ; FIXED-NEXT: [[X1:%.*]] = load i64, ptr [[Q1]], align 8 ; FIXED-NEXT: [[Y1:%.*]] = add i64 [[X1]], 2 @@ -916,64 +935,83 @@ define void @load_store_factor8(ptr %p) { ; FIXED-NEXT: [[X7:%.*]] = load i64, ptr [[Q7]], align 8 ; FIXED-NEXT: [[Y7:%.*]] = add i64 [[X7]], 8 ; FIXED-NEXT: store i64 [[Y7]], ptr [[Q7]], align 8 -; FIXED-NEXT: [[NEXTI]] = add i64 [[I]], 1 -; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024 -; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]] +; FIXED-NEXT: [[NEXTI1]] = add i64 [[I1]], 1 +; FIXED-NEXT: [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024 +; FIXED-NEXT: br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]] ; FIXED: exit: ; FIXED-NEXT: ret void ; ; SCALABLE-LABEL: @load_store_factor8( ; SCALABLE-NEXT: entry: -; SCALABLE-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; SCALABLE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP0]] +; SCALABLE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] ; SCALABLE: vector.ph: -; SCALABLE-NEXT: br label [[VECTOR_BODY:%.*]] +; SCALABLE-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP1]] +; SCALABLE-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; SCALABLE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; SCALABLE-NEXT: br label [[LOOP:%.*]] ; SCALABLE: vector.body: -; SCALABLE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALABLE-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; SCALABLE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 3 -; SCALABLE-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP1]] -; SCALABLE-NEXT: [[WIDE_VEC:%.*]] = load <16 x 
-; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 0, i32 8>
-; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 1, i32 9>
-; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 2, i32 10>
-; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 3, i32 11>
-; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 4, i32 12>
-; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 5, i32 13>
-; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 6, i32 14>
-; SCALABLE-NEXT:    [[STRIDED_VEC7:%.*]] = shufflevector <16 x i64> [[WIDE_VEC]], <16 x i64> poison, <2 x i32> <i32 7, i32 15>
-; SCALABLE-NEXT:    [[TMP4:%.*]] = add <2 x i64> [[STRIDED_VEC]], splat (i64 1)
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add <2 x i64> [[STRIDED_VEC1]], splat (i64 2)
-; SCALABLE-NEXT:    [[TMP8:%.*]] = add <2 x i64> [[STRIDED_VEC2]], splat (i64 3)
-; SCALABLE-NEXT:    [[TMP10:%.*]] = add <2 x i64> [[STRIDED_VEC3]], splat (i64 4)
-; SCALABLE-NEXT:    [[TMP12:%.*]] = add <2 x i64> [[STRIDED_VEC4]], splat (i64 5)
-; SCALABLE-NEXT:    [[TMP14:%.*]] = add <2 x i64> [[STRIDED_VEC5]], splat (i64 6)
-; SCALABLE-NEXT:    [[TMP16:%.*]] = add <2 x i64> [[STRIDED_VEC6]], splat (i64 7)
-; SCALABLE-NEXT:    [[TMP19:%.*]] = add <2 x i64> [[STRIDED_VEC7]], splat (i64 8)
-; SCALABLE-NEXT:    [[TMP21:%.*]] = shufflevector <2 x i64> [[TMP4]], <2 x i64> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP22:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP10]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP23:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> [[TMP14]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP24:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> [[TMP19]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; SCALABLE-NEXT:    [[TMP25:%.*]] = shufflevector <4 x i64> [[TMP21]], <4 x i64> [[TMP22]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP26:%.*]] = shufflevector <4 x i64> [[TMP23]], <4 x i64> [[TMP24]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; SCALABLE-NEXT:    [[TMP27:%.*]] = shufflevector <8 x i64> [[TMP25]], <8 x i64> [[TMP26]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = shufflevector <16 x i64> [[TMP27]], <16 x i64> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; SCALABLE-NEXT:    store <16 x i64> [[INTERLEAVED_VEC]], ptr [[TMP2]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2
-; SCALABLE-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; SCALABLE-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]]
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[TMP3:%.*]] = add i64 [[I]], 0
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP3]], 3
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i64>, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i64>, <vscale x 4 x i64> } @llvm.vector.deinterleave2.nxv8i64(<vscale x 8 x i64> [[WIDE_VEC]])
+; SCALABLE-NEXT:    [[TMP6:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP7:%.*]] = extractvalue { <vscale x 4 x i64>, <vscale x 4 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[STRIDED_VEC1:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP6]])
+; SCALABLE-NEXT:    [[STRIDED_VEC2:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[TMP7]])
+; SCALABLE-NEXT:    [[TMP8:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 0
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 0
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC1]], 1
+; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC2]], 1
+; SCALABLE-NEXT:    [[STRIDED_VEC3:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP8]])
+; SCALABLE-NEXT:    [[STRIDED_VEC4:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP9]])
+; SCALABLE-NEXT:    [[STRIDED_VEC5:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP10]])
+; SCALABLE-NEXT:    [[STRIDED_VEC6:%.*]] = call { <vscale x 1 x i64>, <vscale x 1 x i64> } @llvm.vector.deinterleave2.nxv2i64(<vscale x 2 x i64> [[TMP11]])
+; SCALABLE-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 0
+; SCALABLE-NEXT:    [[TMP13:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 0
+; SCALABLE-NEXT:    [[TMP14:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 0
+; SCALABLE-NEXT:    [[TMP15:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 0
+; SCALABLE-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC3]], 1
+; SCALABLE-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC4]], 1
+; SCALABLE-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC5]], 1
+; SCALABLE-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 1 x i64>, <vscale x 1 x i64> } [[STRIDED_VEC6]], 1
+; SCALABLE-NEXT:    [[TMP20:%.*]] = add <vscale x 1 x i64> [[TMP12]], splat (i64 1)
+; SCALABLE-NEXT:    [[TMP21:%.*]] = add <vscale x 1 x i64> [[TMP13]], splat (i64 2)
+; SCALABLE-NEXT:    [[TMP22:%.*]] = add <vscale x 1 x i64> [[TMP14]], splat (i64 3)
+; SCALABLE-NEXT:    [[TMP23:%.*]] = add <vscale x 1 x i64> [[TMP15]], splat (i64 4)
+; SCALABLE-NEXT:    [[TMP24:%.*]] = add <vscale x 1 x i64> [[TMP16]], splat (i64 5)
+; SCALABLE-NEXT:    [[TMP25:%.*]] = add <vscale x 1 x i64> [[TMP17]], splat (i64 6)
+; SCALABLE-NEXT:    [[TMP26:%.*]] = add <vscale x 1 x i64> [[TMP18]], splat (i64 7)
+; SCALABLE-NEXT:    [[TMP27:%.*]] = add <vscale x 1 x i64> [[TMP19]], splat (i64 8)
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP20]], <vscale x 1 x i64> [[TMP24]])
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC7:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP21]], <vscale x 1 x i64> [[TMP25]])
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC8:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP22]], <vscale x 1 x i64> [[TMP26]])
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC9:%.*]] = call <vscale x 2 x i64> @llvm.vector.interleave2.nxv2i64(<vscale x 1 x i64> [[TMP23]], <vscale x 1 x i64> [[TMP27]])
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC10:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC]], <vscale x 2 x i64> [[INTERLEAVED_VEC8]])
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC11:%.*]] = call <vscale x 4 x i64> @llvm.vector.interleave2.nxv4i64(<vscale x 2 x i64> [[INTERLEAVED_VEC7]], <vscale x 2 x i64> [[INTERLEAVED_VEC9]])
+; SCALABLE-NEXT:    [[INTERLEAVED_VEC12:%.*]] = call <vscale x 8 x i64> @llvm.vector.interleave2.nxv8i64(<vscale x 4 x i64> [[INTERLEAVED_VEC10]], <vscale x 4 x i64> [[INTERLEAVED_VEC11]])
+; SCALABLE-NEXT:    store <vscale x 8 x i64> [[INTERLEAVED_VEC12]], ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP2]]
+; SCALABLE-NEXT:    [[TMP28:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP28]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP10:![0-9]+]]
 ; SCALABLE:       middle.block:
-; SCALABLE-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
+; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 3
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
+; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
+; SCALABLE-NEXT:    [[OFFSET8:%.*]] = shl i64 [[I1]], 3
+; SCALABLE-NEXT:    [[Q8:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET8]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q8]], align 8
 ; SCALABLE-NEXT:    [[Y0:%.*]] = add i64 [[X0]], 1
-; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    store i64 [[Y0]], ptr [[Q8]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET8]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[Y1:%.*]] = add i64 [[X1]], 2
@@ -1008,9 +1046,9 @@ define void @load_store_factor8(ptr %p) {
 ; SCALABLE-NEXT:    [[X7:%.*]] = load i64, ptr [[Q7]], align 8
 ; SCALABLE-NEXT:    [[Y7:%.*]] = add i64 [[X7]], 8
 ; SCALABLE-NEXT:    store i64 [[Y7]], ptr [[Q7]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP11:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP11:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -1080,7 +1118,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
@@ -1088,94 +1126,94 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[Q0]], align 4
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP12]], ptr [[TMP14]], align 4
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4
+; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP1:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
+; CHECK-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
+; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
+; CHECK-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; CHECK-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; CHECK-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
+; CHECK-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]]
 ; CHECK-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
-; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @combine_load_factor2_i32(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 8
-; FIXED-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[I]], 8
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1
 ; FIXED-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
-; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP2]]
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
 ; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[P]], i64 [[TMP3]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[TMP4]], align 4
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <16 x i32>, ptr [[Q0]], align 4
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
-; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
-; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; FIXED-NEXT:    [[TMP8:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; FIXED-NEXT:    [[TMP9:%.*]] = add <8 x i32> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
-; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
-; FIXED-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0
-; FIXED-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8
-; FIXED-NEXT:    store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4
-; FIXED-NEXT:    store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4
-; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <16 x i32> [[WIDE_VEC]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT:    [[WIDE_VEC2:%.*]] = load <16 x i32>, ptr [[TMP5]], align 4
+; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; FIXED-NEXT:    [[TMP6:%.*]] = add <8 x i32> [[STRIDED_VEC]], [[STRIDED_VEC1]]
+; FIXED-NEXT:    [[TMP7:%.*]] = add <8 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC4]]
+; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]]
+; FIXED-NEXT:    [[TMP9:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0
+; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8
+; FIXED-NEXT:    store <8 x i32> [[TMP6]], ptr [[TMP9]], align 4
+; FIXED-NEXT:    store <8 x i32> [[TMP7]], ptr [[TMP10]], align 4
+; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 16
+; FIXED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP1:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
+; FIXED-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
+; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
+; FIXED-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; FIXED-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; FIXED-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
+; FIXED-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]]
 ; FIXED-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
-; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
@@ -1184,7 +1222,7 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
@@ -1192,43 +1230,43 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
-; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
-; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP7]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP8]], align 4
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[Q0]], align 4
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 4 x i32> [[TMP10]], [[TMP11]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 0
-; SCALABLE-NEXT:    store <vscale x 4 x i32> [[TMP12]], ptr [[TMP14]], align 4
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 4 x i32> [[TMP9]], [[TMP10]]
+; SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i32 0
+; SCALABLE-NEXT:    store <vscale x 4 x i32> [[TMP11]], ptr [[TMP13]], align 4
+; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP12:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q0]], align 4
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
+; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
+; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET2]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i32, ptr [[Q2]], align 4
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i32, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i32, ptr [[Q1]], align 4
 ; SCALABLE-NEXT:    [[RES:%.*]] = add i32 [[X0]], [[X1]]
-; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I]]
+; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i32, ptr [[Q]], i64 [[I1]]
 ; SCALABLE-NEXT:    store i32 [[RES]], ptr [[DST]], align 4
-; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP13:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP13:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
@@ -1263,7 +1301,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
 ; CHECK:       vector.ph:
 ; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -1271,94 +1309,94 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; CHECK-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
-; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP8]], align 8
+; CHECK-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
+; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
+; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[Q0]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
-; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
-; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
-; CHECK-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], [[TMP11]]
-; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0
-; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP12]], ptr [[TMP14]], align 8
-; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; CHECK-NEXT:    store <vscale x 2 x i64> [[TMP11]], ptr [[TMP13]], align 8
+; CHECK-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; CHECK-NEXT:    br label [[LOOP1:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; CHECK-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; CHECK-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
+; CHECK-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
+; CHECK-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
+; CHECK-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
+; CHECK-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
 ; CHECK-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; CHECK-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; CHECK-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
+; CHECK-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]]
 ; CHECK-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
-; CHECK-NEXT:    [[NEXTI]] = add i64 [[I]], 1
-; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; CHECK-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
+; CHECK-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
+; CHECK-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 ; FIXED-LABEL: @combine_load_factor2_i64(
 ; FIXED-NEXT:  entry:
-; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; FIXED-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
 ; FIXED:       vector.ph:
-; FIXED-NEXT:    br label [[VECTOR_BODY:%.*]]
+; FIXED-NEXT:    br label [[LOOP:%.*]]
 ; FIXED:       vector.body:
-; FIXED-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[INDEX]], 0
-; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[INDEX]], 4
-; FIXED-NEXT:    [[TMP2:%.*]] = shl i64 [[TMP0]], 1
+; FIXED-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; FIXED-NEXT:    [[TMP0:%.*]] = add i64 [[I]], 0
+; FIXED-NEXT:    [[TMP1:%.*]] = add i64 [[I]], 4
+; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP0]], 1
 ; FIXED-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP1]], 1
-; FIXED-NEXT:    [[TMP4:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP2]]
+; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
 ; FIXED-NEXT:    [[TMP5:%.*]] = getelementptr i64, ptr [[P]], i64 [[TMP3]]
-; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[TMP4]], align 8
+; FIXED-NEXT:    [[WIDE_VEC:%.*]] = load <8 x i64>, ptr [[Q0]], align 8
 ; FIXED-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; FIXED-NEXT:    [[WIDE_VEC1:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
-; FIXED-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
-; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC1]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; FIXED-NEXT:    [[TMP8:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC3]]
-; FIXED-NEXT:    [[TMP9:%.*]] = add <4 x i64> [[STRIDED_VEC2]], [[STRIDED_VEC4]]
-; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
-; FIXED-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[TMP10]], i32 0
-; FIXED-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP10]], i32 4
-; FIXED-NEXT:    store <4 x i64> [[TMP8]], ptr [[TMP12]], align 8
-; FIXED-NEXT:    store <4 x i64> [[TMP9]], ptr [[TMP13]], align 8
-; FIXED-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; FIXED-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
-; FIXED-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; FIXED-NEXT:    [[STRIDED_VEC1:%.*]] = shufflevector <8 x i64> [[WIDE_VEC]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT:    [[WIDE_VEC2:%.*]] = load <8 x i64>, ptr [[TMP5]], align 8
+; FIXED-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; FIXED-NEXT:    [[STRIDED_VEC4:%.*]] = shufflevector <8 x i64> [[WIDE_VEC2]], <8 x i64> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; FIXED-NEXT:    [[TMP6:%.*]] = add <4 x i64> [[STRIDED_VEC]], [[STRIDED_VEC1]]
+; FIXED-NEXT:    [[TMP7:%.*]] = add <4 x i64> [[STRIDED_VEC3]], [[STRIDED_VEC4]]
+; FIXED-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP0]]
+; FIXED-NEXT:    [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i32 0
+; FIXED-NEXT:    [[TMP10:%.*]] = getelementptr i64, ptr [[TMP8]], i32 4
+; FIXED-NEXT:    store <4 x i64> [[TMP6]], ptr [[TMP9]], align 8
+; FIXED-NEXT:    store <4 x i64> [[TMP7]], ptr [[TMP10]], align 8
+; FIXED-NEXT:    [[NEXTI]] = add nuw i64 [[I]], 8
+; FIXED-NEXT:    [[TMP11:%.*]] = icmp eq i64 [[NEXTI]], 1024
+; FIXED-NEXT:    br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
 ; FIXED:       middle.block:
 ; FIXED-NEXT:    br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; FIXED:       scalar.ph:
-; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; FIXED-NEXT:    br label [[LOOP:%.*]]
+; FIXED-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; FIXED-NEXT:    br label [[LOOP1:%.*]]
 ; FIXED:       loop:
-; FIXED-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; FIXED-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; FIXED-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; FIXED-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
+; FIXED-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
+; FIXED-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
+; FIXED-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
+; FIXED-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
 ; FIXED-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; FIXED-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; FIXED-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; FIXED-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
+; FIXED-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]]
 ; FIXED-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
-; FIXED-NEXT:    [[NEXTI]] = add i64 [[I]], 1
-; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; FIXED-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
+; FIXED-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
+; FIXED-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]]
 ; FIXED:       exit:
 ; FIXED-NEXT:    ret void
 ;
@@ -1367,7 +1405,7 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 2
 ; SCALABLE-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
-; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; SCALABLE-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]]
 ; SCALABLE:       vector.ph:
 ; SCALABLE-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 2
@@ -1375,43 +1413,43 @@ define void @combine_load_factor2_i64(ptr noalias %p, ptr noalias %q) {
 ; SCALABLE-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
 ; SCALABLE-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
 ; SCALABLE-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 2
-; SCALABLE-NEXT:    br label [[VECTOR_BODY:%.*]]
+; SCALABLE-NEXT:    br label [[LOOP:%.*]]
 ; SCALABLE:       vector.body:
-; SCALABLE-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
-; SCALABLE-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 1
-; SCALABLE-NEXT:    [[TMP8:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[TMP7]]
-; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[TMP8]], align 8
+; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
+; SCALABLE-NEXT:    [[TMP6:%.*]] = add i64 [[I]], 0
+; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[TMP6]], 1
+; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P:%.*]], i64 [[OFFSET0]]
+; SCALABLE-NEXT:    [[WIDE_VEC:%.*]] = load <vscale x 4 x i64>, ptr [[Q0]], align 8
 ; SCALABLE-NEXT:    [[STRIDED_VEC:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> [[WIDE_VEC]])
-; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
-; SCALABLE-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
-; SCALABLE-NEXT:    [[TMP12:%.*]] = add <vscale x 2 x i64> [[TMP10]], [[TMP11]]
-; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
-; SCALABLE-NEXT:    [[TMP14:%.*]] = getelementptr i64, ptr [[TMP13]], i32 0
-; SCALABLE-NEXT:    store <vscale x 2 x i64> [[TMP12]], ptr [[TMP14]], align 8
-; SCALABLE-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; SCALABLE-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; SCALABLE-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
+; SCALABLE-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 0
+; SCALABLE-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[STRIDED_VEC]], 1
+; SCALABLE-NEXT:    [[TMP11:%.*]] = add <vscale x 2 x i64> [[TMP9]], [[TMP10]]
+; SCALABLE-NEXT:    [[TMP12:%.*]] = getelementptr i64, ptr [[Q:%.*]], i64 [[TMP6]]
+; SCALABLE-NEXT:    [[TMP13:%.*]] = getelementptr i64, ptr [[TMP12]], i32 0
+; SCALABLE-NEXT:    store <vscale x 2 x i64> [[TMP11]], ptr [[TMP13]], align 8
+; SCALABLE-NEXT:    [[NEXTI]] = add nuw i64 [[I]], [[TMP5]]
+; SCALABLE-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[NEXTI]], [[N_VEC]]
+; SCALABLE-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP]], !llvm.loop [[LOOP14:![0-9]+]]
 ; SCALABLE:       middle.block:
 ; SCALABLE-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
 ; SCALABLE-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; SCALABLE:       scalar.ph:
-; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
-; SCALABLE-NEXT:    br label [[LOOP:%.*]]
+; SCALABLE-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ]
+; SCALABLE-NEXT:    br label [[LOOP1:%.*]]
 ; SCALABLE:       loop:
-; SCALABLE-NEXT:    [[I:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI:%.*]], [[LOOP]] ]
-; SCALABLE-NEXT:    [[OFFSET0:%.*]] = shl i64 [[I]], 1
-; SCALABLE-NEXT:    [[Q0:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET0]]
-; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q0]], align 8
-; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET0]], 1
+; SCALABLE-NEXT:    [[I1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[NEXTI1:%.*]], [[LOOP1]] ]
+; SCALABLE-NEXT:    [[OFFSET2:%.*]] = shl i64 [[I1]], 1
+; SCALABLE-NEXT:    [[Q2:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET2]]
+; SCALABLE-NEXT:    [[X0:%.*]] = load i64, ptr [[Q2]], align 8
+; SCALABLE-NEXT:    [[OFFSET1:%.*]] = add i64 [[OFFSET2]], 1
 ; SCALABLE-NEXT:    [[Q1:%.*]] = getelementptr i64, ptr [[P]], i64 [[OFFSET1]]
 ; SCALABLE-NEXT:    [[X1:%.*]] = load i64, ptr [[Q1]], align 8
 ; SCALABLE-NEXT:    [[RES:%.*]] = add i64 [[X0]], [[X1]]
-; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I]]
+; SCALABLE-NEXT:    [[DST:%.*]] = getelementptr i64, ptr [[Q]], i64 [[I1]]
 ; SCALABLE-NEXT:    store i64 [[RES]], ptr [[DST]], align 8
-; SCALABLE-NEXT:    [[NEXTI]] = add i64 [[I]], 1
-; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI]], 1024
-; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP15:![0-9]+]]
+; SCALABLE-NEXT:    [[NEXTI1]] = add i64 [[I1]], 1
+; SCALABLE-NEXT:    [[DONE:%.*]] = icmp eq i64 [[NEXTI1]], 1024
+; SCALABLE-NEXT:    br i1 [[DONE]], label [[EXIT]], label [[LOOP1]], !llvm.loop [[LOOP15:![0-9]+]]
 ; SCALABLE:       exit:
 ; SCALABLE-NEXT:    ret void
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
new file mode 100644
index 0000000000000..362ec22600f92
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sve-interleave-vectorization.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -passes=loop-vectorize,interleaved-access -mattr=+sve -S -o - %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64"
+
+%struct.xyzt = type { i32, i32, i32, i32 }
+; for (int i = 0; i < 1024; ++i) {
+;   dst[i].x = a[i].x + b[i].x;
+;   dst[i].y = a[i].y - b[i].y;
+;   dst[i].z = a[i].z << b[i].z;
+;   dst[i].t = a[i].t >> b[i].t;
+; }
+
+define void @interleave_deinterleave(ptr noalias %dst, ptr %a, ptr %b) {
+; CHECK-LABEL: @interleave_deinterleave(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[TMP0]], 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]]
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 [[TMP2]], 4
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]]
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]]
+; CHECK-NEXT:    [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i64 [[TMP4]], 4
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[INDEX]], 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_XYZT:%.*]], ptr [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP7]])
+; CHECK-NEXT:    [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 2
+; CHECK-NEXT:    [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 3
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[LDN9:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld4.sret.nxv4i32(<vscale x 4 x i1> splat (i1 true), ptr [[TMP13]])
+; CHECK-NEXT:    [[TMP16:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 0
+; CHECK-NEXT:    [[TMP17:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 1
+; CHECK-NEXT:    [[TMP18:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 2
+; CHECK-NEXT:    [[TMP19:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN9]], 3
+; CHECK-NEXT:    [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP16]], [[TMP9]]
+; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP22:%.*]] = sub nsw <vscale x 4 x i32> [[TMP10]], [[TMP17]]
+; CHECK-NEXT:    [[TMP23:%.*]] = shl <vscale x 4 x i32> [[TMP11]], [[TMP18]]
+; CHECK-NEXT:    [[TMP24:%.*]] = ashr <vscale x 4 x i32> [[TMP12]], [[TMP19]]
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32> [[TMP20]], <vscale x 4 x i32> [[TMP22]], <vscale x 4 x i32> [[TMP23]], <vscale x 4 x i32> [[TMP24]], <vscale x 4 x i1> splat (i1 true), ptr [[TMP21]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
+; CHECK-NEXT:    [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[A]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP31:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[B]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    [[TMP32:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP32]], [[TMP31]]
+; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_XYZT]], ptr [[DST]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 [[ADD]], ptr [[ARRAYIDX5]], align 4
+; CHECK-NEXT:    [[Y:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 4
+; CHECK-NEXT:    [[TMP33:%.*]] = load i32, ptr [[Y]], align 4
+; CHECK-NEXT:    [[Y11:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 4
+; CHECK-NEXT:    [[TMP26:%.*]] = load i32, ptr [[Y11]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP33]], [[TMP26]]
+; CHECK-NEXT:    [[Y14:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 4
+; CHECK-NEXT:    store i32 [[SUB]], ptr [[Y14]], align 4
+; CHECK-NEXT:    [[Z:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 8
+; CHECK-NEXT:    [[TMP27:%.*]] = load i32, ptr [[Z]], align 4
+; CHECK-NEXT:    [[Z19:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 8
+; CHECK-NEXT:    [[TMP28:%.*]] = load i32, ptr [[Z19]], align 4
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 [[TMP27]], [[TMP28]]
+; CHECK-NEXT:    [[Z22:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 8
+; CHECK-NEXT:    store i32 [[SHL]], ptr [[Z22]], align 4
+; CHECK-NEXT:    [[T:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX]], i64 12
+; CHECK-NEXT:    [[TMP29:%.*]] = load i32, ptr [[T]], align 4
+; CHECK-NEXT:    [[T27:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX2]], i64 12
+; CHECK-NEXT:    [[TMP30:%.*]] = load i32, ptr [[T27]], align 4
+; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 [[TMP29]], [[TMP30]]
+; CHECK-NEXT:    [[T30:%.*]] = getelementptr inbounds nuw i8, ptr [[ARRAYIDX5]], i64 12
+; CHECK-NEXT:    store i32 [[SHR]], ptr [[T30]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1024
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %gep.a = getelementptr inbounds %struct.xyzt, ptr %a, i64 %iv
+  %a.0 = load i32, ptr %gep.a, align 4
+  %gep.b = getelementptr inbounds %struct.xyzt, ptr %b, i64 %iv
+  %b.0 = load i32, ptr %gep.b, align 4
+  %add = add nsw i32 %b.0, %a.0
+  %gep.dst = getelementptr inbounds %struct.xyzt, ptr %dst, i64 %iv
+  store i32 %add, ptr %gep.dst, align 4
+  %gep.a.1 = getelementptr inbounds nuw i8, ptr %gep.a, i64 4
+  %a.1 = load i32, ptr %gep.a.1, align 4
+  %gep.b.1 = getelementptr inbounds nuw i8, ptr %gep.b, i64 4
+  %b.1 = load i32, ptr %gep.b.1, align 4
+  %sub = sub nsw i32 %a.1, %b.1
+  %gep.dst.1 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 4
+  store i32 %sub, ptr %gep.dst.1, align 4
+  %gep.a.2 = getelementptr inbounds nuw i8, ptr %gep.a, i64 8
+  %a.2 = load i32, ptr %gep.a.2, align 4
+  %gep.b.2 = getelementptr inbounds nuw i8, ptr %gep.b, i64 8
+  %b.2 = load i32, ptr %gep.b.2, align 4
+  %shl = shl i32 %a.2, %b.2
+  %gep.dst.2 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 8
+  store i32 %shl, ptr %gep.dst.2, align 4
+  %gep.a.3 = getelementptr inbounds nuw i8, ptr %gep.a, i64 12
+  %a.3 = load i32, ptr %gep.a.3, align 4
+  %gep.b.3 = getelementptr inbounds nuw i8, ptr %gep.b, i64 12
+  %b.3 = load i32, ptr %gep.b.3, align 4
+  %shr = ashr i32 %a.3, %b.3
+  %gep.dst.3 = getelementptr inbounds nuw i8, ptr %gep.dst, i64 12
+  store i32 %shr, ptr %gep.dst.3, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
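
The PhaseOrdering test above pins the end-to-end result: loop-vectorize emits the recursive (de)interleave2 tree for the factor-4 group, and interleaved-access then folds that tree into single ld4/st4 instructions. For reference, a minimal hand-written sketch of the load-side tree shape itself, before any target folding, for factor 4; the value names and the <vscale x 4 x i32> member type here are illustrative assumptions, not taken from any generated output:

    ; one wide load carries 4 interleaved members (factor 4)
    %wide = load <vscale x 16 x i32>, ptr %p
    ; level 1: split even and odd lanes; each half is still interleaved by 2
    %l1 = call { <vscale x 8 x i32>, <vscale x 8 x i32> } @llvm.vector.deinterleave2.nxv16i32(<vscale x 16 x i32> %wide)
    %ev = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %l1, 0
    %od = extractvalue { <vscale x 8 x i32>, <vscale x 8 x i32> } %l1, 1
    ; level 2: each half splits into two members of the original group
    %l2a = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %ev)
    %l2b = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %od)
    ; member order after the tree: m0 = %l2a[0], m1 = %l2b[0], m2 = %l2a[1], m3 = %l2b[1]

The store side builds the mirror image with @llvm.vector.interleave2, pairing member I with member I+Factor/2 at each level, which is the pairing visible in the factor-8 SCALABLE checks above (TMP20 with TMP24, TMP21 with TMP25, and so on).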