diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e9d0315d114f6..e3374724b04a1 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -8231,13 +8231,24 @@ VPRecipeBase *VPRecipeBuilder::tryToWidenMemory(Instruction *I,
   bool Consecutive =
       Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

+  VPValue *Ptr = isa<LoadInst>(I) ? Operands[0] : Operands[1];
+  if (Decision != LoopVectorizationCostModel::CM_GatherScatter &&
+      Decision != LoopVectorizationCostModel::CM_Interleave) {
+    auto *VectorPtr = Reverse
+                          ? new VPInstruction(VPInstruction::CreateVectorPtr,
+                                              {Ptr, Ptr}, I->getDebugLoc())
+                          : new VPInstruction(VPInstruction::CreateVectorPtr,
+                                              {Ptr}, I->getDebugLoc());
+    Builder.getInsertBlock()->appendRecipe(VectorPtr);
+    Ptr = VectorPtr;
+  }
   if (LoadInst *Load = dyn_cast<LoadInst>(I))
-    return new VPWidenMemoryInstructionRecipe(*Load, Operands[0], Mask,
-                                              Consecutive, Reverse);
+    return new VPWidenMemoryInstructionRecipe(*Load, Ptr, Mask, Consecutive,
+                                              Reverse);

   StoreInst *Store = cast<StoreInst>(I);
-  return new VPWidenMemoryInstructionRecipe(*Store, Operands[1], Operands[0],
-                                            Mask, Consecutive, Reverse);
+  return new VPWidenMemoryInstructionRecipe(*Store, Ptr, Operands[0], Mask,
+                                            Consecutive, Reverse);
 }

 /// Creates a VPWidenIntOrFpInductionRecpipe for \p Phi. If needed, it will also
@@ -9525,50 +9536,13 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
   InnerLoopVectorizer::VectorParts BlockInMaskParts(State.UF);
   bool isMaskRequired = getMask();
   if (isMaskRequired)
-    for (unsigned Part = 0; Part < State.UF; ++Part)
-      BlockInMaskParts[Part] = State.get(getMask(), Part);
-
-  const auto CreateVecPtr = [&](unsigned Part, Value *Ptr) -> Value * {
-    // Calculate the pointer for the specific unroll-part.
-    Value *PartPtr = nullptr;
-
-    // Use i32 for the gep index type when the value is constant,
-    // or query DataLayout for a more suitable index type otherwise.
-    const DataLayout &DL =
-        Builder.GetInsertBlock()->getModule()->getDataLayout();
-    Type *IndexTy = State.VF.isScalable() && (isReverse() || Part > 0)
-                        ? DL.getIndexType(PointerType::getUnqual(
-                              ScalarDataTy->getContext()))
-                        : Builder.getInt32Ty();
-    bool InBounds = false;
-    if (auto *gep = dyn_cast<GetElementPtrInst>(Ptr->stripPointerCasts()))
-      InBounds = gep->isInBounds();
-    if (isReverse()) {
-      // If the address is consecutive but reversed, then the
-      // wide store needs to start at the last vector element.
-      // RunTimeVF = VScale * VF.getKnownMinValue()
-      // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue()
-      Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF);
-      // NumElt = -Part * RunTimeVF
-      Value *NumElt =
-          Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF);
-      // LastLane = 1 - RunTimeVF
-      Value *LastLane =
-          Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF);
-      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds);
-      PartPtr =
-          Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds);
-      if (isMaskRequired) // Reverse of a null all-one mask is a null mask.
-        BlockInMaskParts[Part] =
-            Builder.CreateVectorReverse(BlockInMaskParts[Part], "reverse");
-    } else {
-      Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part);
-      PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds);
+    for (unsigned Part = 0; Part < State.UF; ++Part) {
+      Value *Mask = State.get(getMask(), Part);
+      if (isReverse())
+        Mask = Builder.CreateVectorReverse(Mask, "reverse");
+      BlockInMaskParts[Part] = Mask;
     }
-    return PartPtr;
-  };
-
   // Handle Stores:
   if (SI) {
     State.setDebugLocFrom(SI->getDebugLoc());
@@ -9589,8 +9563,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
         // We don't want to update the value in the map as it might be used in
         // another expression. So don't call resetVectorValue(StoredVal).
       }
-      auto *VecPtr =
-          CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+      auto *VecPtr = State.get(getAddr(), Part);
       if (isMaskRequired)
         NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
                                           BlockInMaskParts[Part]);
@@ -9614,8 +9587,7 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
                                          nullptr, "wide.masked.gather");
         State.addMetadata(NewLI, LI);
       } else {
-        auto *VecPtr =
-            CreateVecPtr(Part, State.get(getAddr(), VPIteration(0, 0)));
+        auto *VecPtr = State.get(getAddr(), Part);
         if (isMaskRequired)
           NewLI = Builder.CreateMaskedLoad(
               DataTy, VecPtr, Alignment, BlockInMaskParts[Part],
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index a26308a212bbd..be770e33e92a3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1038,7 +1038,8 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     // canonical IV separately for each unrolled part.
     CanonicalIVIncrementForPart,
     BranchOnCount,
-    BranchOnCond
+    BranchOnCond,
+    CreateVectorPtr
   };

 private:
@@ -1146,6 +1147,7 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue {
     case VPInstruction::CanonicalIVIncrement:
     case VPInstruction::CanonicalIVIncrementForPart:
     case VPInstruction::BranchOnCount:
+    case VPInstruction::CreateVectorPtr:
      return true;
    };
    llvm_unreachable("switch should return");
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index 6b3218dca1b18..1dac8a806d657 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -122,6 +122,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
     case VPInstruction::CalculateTripCountMinusVF:
     case VPInstruction::CanonicalIVIncrement:
     case VPInstruction::CanonicalIVIncrementForPart:
+    case VPInstruction::CreateVectorPtr:
      return false;
    default:
      return true;
@@ -404,6 +405,49 @@ Value *VPInstruction::generateInstruction(VPTransformState &State,
     Builder.GetInsertBlock()->getTerminator()->eraseFromParent();
     return CondBr;
   }
+  case VPInstruction::CreateVectorPtr: {
+    // Calculate the pointer for the specific unroll-part.
+    Value *PartPtr = nullptr;
+    bool IsReverse = getNumOperands() > 1;
+    auto *MemR = cast<VPWidenMemoryInstructionRecipe>(*user_begin());
+    Type *ScalarDataTy =
+        MemR->isStore() ? cast<StoreInst>(&MemR->getIngredient())
+                              ->getValueOperand()
+                              ->getType()
+                        : cast<LoadInst>(&MemR->getIngredient())->getType();
+    // Use i32 for the gep index type when the value is constant,
+    // or query DataLayout for a more suitable index type otherwise.
+    const DataLayout &DL =
+        Builder.GetInsertBlock()->getModule()->getDataLayout();
+    Type *IndexTy = State.VF.isScalable() && (IsReverse || Part > 0)
+                        ?
DL.getIndexType(ScalarDataTy->getPointerTo()) + : Builder.getInt32Ty(); + Value *Ptr = State.get(getOperand(0), VPIteration(0, 0)); + bool InBounds = false; + if (auto *gep = dyn_cast(Ptr->stripPointerCasts())) + InBounds = gep->isInBounds(); + if (IsReverse) { + // If the address is consecutive but reversed, then the + // wide store needs to start at the last vector element. + // RunTimeVF = VScale * VF.getKnownMinValue() + // For fixed-width VScale is 1, then RunTimeVF = VF.getKnownMinValue() + Value *RunTimeVF = getRuntimeVF(Builder, IndexTy, State.VF); + // NumElt = -Part * RunTimeVF + Value *NumElt = + Builder.CreateMul(ConstantInt::get(IndexTy, -(int64_t)Part), RunTimeVF); + // LastLane = 1 - RunTimeVF + Value *LastLane = + Builder.CreateSub(ConstantInt::get(IndexTy, 1), RunTimeVF); + PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, NumElt, "", InBounds); + PartPtr = + Builder.CreateGEP(ScalarDataTy, PartPtr, LastLane, "", InBounds); + } else { + Value *Increment = createStepForVF(Builder, IndexTy, State.VF, Part); + PartPtr = Builder.CreateGEP(ScalarDataTy, Ptr, Increment, "", InBounds); + } + + return PartPtr; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -483,6 +527,9 @@ void VPInstruction::print(raw_ostream &O, const Twine &Indent, case VPInstruction::BranchOnCount: O << "branch-on-count"; break; + case VPInstruction::CreateVectorPtr: + O << "create-vector-pointer"; + break; default: O << Instruction::getOpcodeName(getOpcode()); } diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll index 4a8e07eaaf757..cbc4733cf5cf5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll @@ -179,8 +179,8 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i ; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i16> [[TMP6]] to <16 x i8> ; CHECK-NEXT: [[TMP9:%.*]] = sext i32 [[INDEX]] to i64 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP9]] -; CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 16 +; CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP10]], align 1 ; CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP11]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 32 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 992 @@ -193,18 +193,18 @@ define void @test_shrink_zext_in_preheader(ptr noalias %src, ptr noalias %dst, i ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> undef, i16 [[B]], i64 0 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX3:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP14:%.*]] = trunc i32 [[A]] to i16 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> undef, i16 [[TMP14]], i64 0 ; CHECK-NEXT: [[TMP16:%.*]] = mul <8 x i16> [[TMP15]], [[TMP13]] ; CHECK-NEXT: [[TMP17:%.*]] = lshr <8 x i16> [[TMP16]], ; CHECK-NEXT: [[TMP18:%.*]] = trunc <8 x i16> [[TMP17]] to <8 x i8> ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i8> [[TMP18]], <8 x i8> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[INDEX4]] to i64 
+; CHECK-NEXT: [[TMP20:%.*]] = sext i32 [[INDEX3]] to i64 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP20]] ; CHECK-NEXT: store <8 x i8> [[TMP19]], ptr [[TMP21]], align 1 -; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i32 [[INDEX4]], 8 -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 1000 +; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i32 [[INDEX3]], 8 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i32 [[INDEX_NEXT8]], 1000 ; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -268,7 +268,7 @@ define void @test_shrink_select(ptr noalias %src, ptr noalias %dst, i32 %A, i1 % ; CHECK: vec.epilog.ph: ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX2:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT5:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i32 [ 992, [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP10:%.*]] = trunc i32 [[A]] to i16 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i16> undef, i16 [[TMP10]], i64 0 ; CHECK-NEXT: [[TMP12:%.*]] = mul <8 x i16> [[TMP11]], @@ -276,11 +276,11 @@ define void @test_shrink_select(ptr noalias %src, ptr noalias %dst, i32 %A, i1 % ; CHECK-NEXT: [[TMP14:%.*]] = lshr <8 x i16> [[TMP13]], ; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[C]], <8 x i16> [[TMP14]], <8 x i16> [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = trunc <8 x i16> [[TMP15]] to <8 x i8> -; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[INDEX2]] to i64 +; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[INDEX1]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP17]] ; CHECK-NEXT: store <8 x i8> [[TMP16]], ptr [[TMP18]], align 1 -; CHECK-NEXT: [[INDEX_NEXT5]] = add nuw i32 [[INDEX2]], 8 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT5]], 1000 +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i32 [[INDEX1]], 8 +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i32 [[INDEX_NEXT4]], 1000 ; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll index 24d6d2d532aa0..24c59fdb47b61 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll @@ -38,8 +38,8 @@ define void @test_widen_ptr_induction(ptr %ptr.start.1) { ; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP9]], i32 1 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP13]]) ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP14]], align 1 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 2 +; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP14]], align 1 ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP15]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10000 @@ -55,22 +55,22 @@ define void @test_widen_ptr_induction(ptr 
%ptr.start.1) { ; CHECK-NEXT: [[IND_END5:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 10000 ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX9]], 0 -; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP17]] -; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX9]], 1 -; CHECK-NEXT: [[NEXT_GEP11:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP18]] -; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP10]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP11]], i32 1 +; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP17:%.*]] = add i64 [[INDEX8]], 0 +; CHECK-NEXT: [[NEXT_GEP9:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = add i64 [[INDEX8]], 1 +; CHECK-NEXT: [[NEXT_GEP10:%.*]] = getelementptr i8, ptr [[PTR_START_1]], i64 [[TMP18]] +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <2 x ptr> poison, ptr [[NEXT_GEP9]], i32 0 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <2 x ptr> [[TMP19]], ptr [[NEXT_GEP10]], i32 1 ; CHECK-NEXT: [[TMP21:%.*]] = icmp ne <2 x ptr> [[TMP20]], zeroinitializer ; CHECK-NEXT: [[TMP22:%.*]] = extractelement <2 x i1> [[TMP21]], i32 0 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP22]]) ; CHECK-NEXT: [[TMP23:%.*]] = extractelement <2 x i1> [[TMP21]], i32 1 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP23]]) -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP10]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[NEXT_GEP9]], i32 0 ; CHECK-NEXT: store <2 x i8> zeroinitializer, ptr [[TMP24]], align 1 -; CHECK-NEXT: [[INDEX_NEXT12]] = add nuw i64 [[INDEX9]], 2 -; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT12]], 10000 +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX8]], 2 +; CHECK-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT11]], 10000 ; CHECK-NEXT: br i1 [[TMP25]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 false, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -132,8 +132,8 @@ define void @test_widen_induction(ptr %A, i64 %N) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 +; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP4]], align 4 ; CHECK-NEXT: store <2 x i64> [[STEP_ADD]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], @@ -156,13 +156,13 @@ define void @test_widen_induction(ptr %A, i64 %N) { ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ 
[[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT11:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND8:%.*]] = phi <2 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT10:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX7]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 ; CHECK-NEXT: store <2 x i64> [[VEC_IND8]], ptr [[TMP9]], align 4 -; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[INDEX_NEXT11]] = add nuw i64 [[INDEX7]], 2 ; CHECK-NEXT: [[VEC_IND_NEXT10]] = add <2 x i64> [[VEC_IND8]], ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT11]], [[N_VEC4]] ; CHECK-NEXT: br i1 [[TMP10]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}} @@ -224,8 +224,8 @@ define void @test_widen_induction_variable_start(ptr %A, i64 %N, i64 %start) { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 2 +; CHECK-NEXT: store <2 x i64> [[VEC_IND]], ptr [[TMP5]], align 4 ; CHECK-NEXT: store <2 x i64> [[STEP_ADD]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], @@ -315,8 +315,8 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) { ; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> [[VEC_IND]], ; CHECK-NEXT: [[TMP5:%.*]] = add <2 x i64> [[STEP_ADD]], ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[TMP6]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 +; CHECK-NEXT: store <2 x i64> [[TMP4]], ptr [[TMP6]], align 4 ; CHECK-NEXT: store <2 x i64> [[TMP5]], ptr [[TMP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD]], @@ -339,14 +339,14 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) { ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[DOTSPLAT]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX9:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT13:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND10:%.*]] = phi <2 x i64> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT12:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX9]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[VEC_IND10]], ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 ; CHECK-NEXT: store <2 x i64> [[TMP11]], ptr [[TMP12]], align 4 -; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[OFFSET_IDX]], 2 +; CHECK-NEXT: [[INDEX_NEXT13]] = add nuw i64 [[INDEX9]], 2 ; 
CHECK-NEXT: [[VEC_IND_NEXT12]] = add <2 x i64> [[VEC_IND10]], ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT13]], [[IND_END]] ; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}} @@ -410,8 +410,8 @@ define void @test_widen_extended_induction(ptr %dst) { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST:%.*]], i64 0, i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST]], i64 0, i64 [[TMP3]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 0 -; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 2 +; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP6]], align 1 ; CHECK-NEXT: store <2 x i8> [[STEP_ADD]], ptr [[TMP7]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], @@ -429,17 +429,17 @@ define void @test_widen_extended_induction(ptr %dst) { ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX8:%.*]] = trunc i32 [[INDEX4]] to i8 -; CHECK-NEXT: [[TMP9:%.*]] = add i8 [[OFFSET_IDX8]], 0 +; CHECK-NEXT: [[INDEX3:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX7:%.*]] = trunc i32 [[INDEX3]] to i8 +; CHECK-NEXT: [[TMP9:%.*]] = add i8 [[OFFSET_IDX7]], 0 ; CHECK-NEXT: [[TMP10:%.*]] = zext i8 [[TMP9]] to i64 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [6 x i8], ptr [[DST]], i64 0, i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: store <2 x i8> [[VEC_IND5]], ptr [[TMP12]], align 1 -; CHECK-NEXT: [[INDEX_NEXT9]] = add nuw i32 [[INDEX4]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <2 x i8> [[VEC_IND5]], -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT9]], 10000 +; CHECK-NEXT: store <2 x i8> [[VEC_IND4]], ptr [[TMP12]], align 1 +; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i32 [[INDEX3]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <2 x i8> [[VEC_IND4]], +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT8]], 10000 ; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -492,8 +492,8 @@ define void @test_widen_truncated_induction(ptr %A) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP4]], align 1 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 2 +; CHECK-NEXT: store <2 x i8> [[VEC_IND]], ptr [[TMP4]], align 1 ; CHECK-NEXT: store <2 x i8> [[STEP_ADD]], 
ptr [[TMP5]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i8> [[STEP_ADD]], @@ -512,15 +512,15 @@ define void @test_widen_truncated_induction(ptr %A) { ; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i8> [[DOTSPLAT]], ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND5:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[INDEX3:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT7:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND4:%.*]] = phi <2 x i8> [ [[INDUCTION]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX3]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: store <2 x i8> [[VEC_IND5]], ptr [[TMP10]], align 1 -; CHECK-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[OFFSET_IDX]], 2 -; CHECK-NEXT: [[VEC_IND_NEXT7]] = add <2 x i8> [[VEC_IND5]], -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 10000 +; CHECK-NEXT: store <2 x i8> [[VEC_IND4]], ptr [[TMP10]], align 1 +; CHECK-NEXT: [[INDEX_NEXT7]] = add nuw i64 [[INDEX3]], 2 +; CHECK-NEXT: [[VEC_IND_NEXT6]] = add <2 x i8> [[VEC_IND4]], +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT7]], 10000 ; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], {{!llvm.loop ![0-9]+}} ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll index e80630412d67b..33d7a3a3c8ac0 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/fixed-order-recurrence.ll @@ -30,8 +30,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[WIDE_LOAD1]] = load <16 x i8>, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD1]], <16 x i32> @@ -40,8 +40,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1 ; CHECK-NEXT: 
[[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 +; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1 ; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP14]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -129,8 +129,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[WIDE_LOAD5]] = load <16 x i8>, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> ; CHECK-NEXT: [[TMP8]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD5]], <16 x i32> @@ -147,8 +147,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 0 -; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 16 +; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP21]], align 1 ; CHECK-NEXT: store <16 x i8> [[TMP18]], ptr [[TMP22]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll index 760c294c8853a..0e54bd15e5ea5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-load-store.ll @@ -41,12 +41,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP3]] ; INTERLEAVE-4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP4]] ; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 0 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 ; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 16 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 ; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 32 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 ; INTERLEAVE-4-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i32 48 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP9]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP10]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP11]], align 1 ; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP12]], align 1 ; INTERLEAVE-4-NEXT: [[TMP13:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; INTERLEAVE-4-NEXT: 
[[TMP14:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]] @@ -65,12 +65,12 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-4-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP3]] ; INTERLEAVE-4-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP4]] ; INTERLEAVE-4-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0 -; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP21]], ptr [[TMP29]], align 1 ; INTERLEAVE-4-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 16 -; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP22]], ptr [[TMP30]], align 1 ; INTERLEAVE-4-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 32 -; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP23]], ptr [[TMP31]], align 1 ; INTERLEAVE-4-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 48 +; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP21]], ptr [[TMP29]], align 1 +; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP22]], ptr [[TMP30]], align 1 +; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP23]], ptr [[TMP31]], align 1 ; INTERLEAVE-4-NEXT: store <16 x i8> [[TMP24]], ptr [[TMP32]], align 1 ; INTERLEAVE-4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; INTERLEAVE-4-NEXT: [[TMP33:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -155,8 +155,8 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP1]] ; INTERLEAVE-2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[TMP2]] ; INTERLEAVE-2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 -; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; INTERLEAVE-2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16 +; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; INTERLEAVE-2-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP6]], align 1 ; INTERLEAVE-2-NEXT: [[TMP7:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; INTERLEAVE-2-NEXT: [[TMP8:%.*]] = icmp sgt <16 x i8> [[WIDE_LOAD4]], [[BROADCAST_SPLAT]] @@ -167,8 +167,8 @@ define void @interleave_single_load_store(ptr %src, ptr %dst, i64 %N, i8 %a, i8 ; INTERLEAVE-2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP1]] ; INTERLEAVE-2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP2]] ; INTERLEAVE-2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 0 -; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP15]], align 1 ; INTERLEAVE-2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i32 16 +; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP15]], align 1 ; INTERLEAVE-2-NEXT: store <16 x i8> [[TMP12]], ptr [[TMP16]], align 1 ; INTERLEAVE-2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; INTERLEAVE-2-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll index c4f60c8e0911f..0d7a1dbc841e5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/interleaving-reduction.ll @@ -32,12 +32,12 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP2]] ; INTERLEAVE-4-NEXT: 
[[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP3]] ; INTERLEAVE-4-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1 ; INTERLEAVE-4-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 4 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1 ; INTERLEAVE-4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 8 -; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1 ; INTERLEAVE-4-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 12 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP8]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP9]], align 1 +; INTERLEAVE-4-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP10]], align 1 ; INTERLEAVE-4-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP11]], align 1 ; INTERLEAVE-4-NEXT: [[TMP12]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; INTERLEAVE-4-NEXT: [[TMP13]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD4]] @@ -87,8 +87,8 @@ define i32 @interleave_integer_reduction(ptr %src, i64 %N) { ; INTERLEAVE-2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i64 [[TMP0]] ; INTERLEAVE-2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i64 [[TMP1]] ; INTERLEAVE-2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 -; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1 ; INTERLEAVE-2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 4 +; INTERLEAVE-2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP4]], align 1 ; INTERLEAVE-2-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP5]], align 1 ; INTERLEAVE-2-NEXT: [[TMP6]] = add <4 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; INTERLEAVE-2-NEXT: [[TMP7]] = add <4 x i32> [[VEC_PHI1]], [[WIDE_LOAD2]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll index 52df1a38f5b42..95dc010811e6c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -38,13 +38,13 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP4]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 8 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[NEXT_GEP6]], align 2 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[NEXT_GEP6]], i64 8 +; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[NEXT_GEP6]], align 2 ; CHECK-NEXT: store <8 x i16> [[TMP7]], ptr [[TMP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -121,13 +121,13 @@ define void @umin(ptr nocapture 
readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD]], <16 x i8> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.umin.v16i8(<16 x i8> [[WIDE_LOAD5]], <16 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[NEXT_GEP3]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP3]], i64 16 +; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[NEXT_GEP3]], align 2 ; CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll index f045f486eac27..99d43f03906b2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -258,18 +258,18 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] ; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] ; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 ; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 ; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 ; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP34]] = fadd [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-UNORDERED-NEXT: [[TMP35]] = fadd [[WIDE_LOAD4]], [[VEC_PHI1]] @@ -341,18 +341,18 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] ; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = 
getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] ; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 ; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 ; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 ; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 ; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 ; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP33]], align 4 ; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[WIDE_LOAD]]) ; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[TMP34]], [[WIDE_LOAD1]]) @@ -458,18 +458,18 @@ define float @fadd_strict_unroll(ptr noalias nocapture readonly %a, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] ; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] ; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 ; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 ; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 ; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call 
@llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = select [[ACTIVE_LANE_MASK]], [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, float -0.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], [[TMP61]]) @@ -1526,36 +1526,36 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] ; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] ; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 ; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 ; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 ; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] ; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] ; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] ; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] ; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP38]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 ; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP41]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 ; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP44]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 ; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP38]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP41]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr 
[[TMP44]], align 4 ; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP47]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP48]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) ; CHECK-UNORDERED-NEXT: [[TMP49]] = call @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) @@ -1629,36 +1629,36 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] ; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] ; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 ; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 ; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 ; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 ; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 ; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP33]], align 4 ; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] ; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] ; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] ; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] ; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP38]], align 4 ; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 ; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP41]], align 4 ; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 ; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP44]], align 4 ; CHECK-ORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 ; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP38]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP41]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP44]], align 4 ; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP47]], align 4 ; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = fmul [[WIDE_LOAD]], 
[[WIDE_LOAD4]] ; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = fmul [[WIDE_LOAD1]], [[WIDE_LOAD5]] @@ -1770,36 +1770,36 @@ define float @fmuladd_strict(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] ; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] ; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 ; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 ; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 ; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]] ; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]] ; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] ; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]] ; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 ; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP67]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK6]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 ; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 
@llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 24 ; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, [[ACTIVE_LANE_MASK8]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] ; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = fmul [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] @@ -1940,36 +1940,36 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-UNORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] ; CHECK-UNORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] ; CHECK-UNORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 ; CHECK-UNORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 ; CHECK-UNORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 ; CHECK-UNORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP27]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-UNORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP33]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] ; CHECK-UNORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] ; CHECK-UNORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] ; CHECK-UNORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] ; CHECK-UNORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP38]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 ; CHECK-UNORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP41]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 ; CHECK-UNORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds 
float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP44]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-UNORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 ; CHECK-UNORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP38]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD8:%.*]] = load , ptr [[TMP41]], align 4 +; CHECK-UNORDERED-NEXT: [[WIDE_LOAD9:%.*]] = load , ptr [[TMP44]], align 4 ; CHECK-UNORDERED-NEXT: [[WIDE_LOAD10:%.*]] = load , ptr [[TMP47]], align 4 ; CHECK-UNORDERED-NEXT: [[TMP48]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD]], [[WIDE_LOAD7]], [[VEC_PHI]]) ; CHECK-UNORDERED-NEXT: [[TMP49]] = call nnan @llvm.fmuladd.nxv8f32( [[WIDE_LOAD4]], [[WIDE_LOAD8]], [[VEC_PHI1]]) @@ -2043,36 +2043,36 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] ; CHECK-ORDERED-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP19]] ; CHECK-ORDERED-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 ; CHECK-ORDERED-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP26:%.*]] = mul i64 [[TMP25]], 8 ; CHECK-ORDERED-NEXT: [[TMP27:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP26]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 ; CHECK-ORDERED-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 16 ; CHECK-ORDERED-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP29]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-ORDERED-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 24 ; CHECK-ORDERED-NEXT: [[TMP33:%.*]] = getelementptr inbounds float, ptr [[TMP20]], i64 [[TMP32]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP24]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP27]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP30]], align 4 ; CHECK-ORDERED-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP33]], align 4 ; CHECK-ORDERED-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP4]] ; CHECK-ORDERED-NEXT: [[TMP35:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP9]] ; CHECK-ORDERED-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP14]] ; CHECK-ORDERED-NEXT: [[TMP37:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP19]] ; CHECK-ORDERED-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i32 0 -; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP38]], align 4 ; CHECK-ORDERED-NEXT: [[TMP39:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP40:%.*]] = mul i64 [[TMP39]], 8 ; CHECK-ORDERED-NEXT: [[TMP41:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP40]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP41]], align 4 ; CHECK-ORDERED-NEXT: [[TMP42:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP43:%.*]] = mul i64 [[TMP42]], 16 ; CHECK-ORDERED-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP43]] -; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP44]], align 4 ; 
CHECK-ORDERED-NEXT: [[TMP45:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-NEXT: [[TMP46:%.*]] = mul i64 [[TMP45]], 24 ; CHECK-ORDERED-NEXT: [[TMP47:%.*]] = getelementptr inbounds float, ptr [[TMP34]], i64 [[TMP46]] +; CHECK-ORDERED-NEXT: [[WIDE_LOAD4:%.*]] = load , ptr [[TMP38]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP41]], align 4 +; CHECK-ORDERED-NEXT: [[WIDE_LOAD6:%.*]] = load , ptr [[TMP44]], align 4 ; CHECK-ORDERED-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP47]], align 4 ; CHECK-ORDERED-NEXT: [[TMP48:%.*]] = fmul nnan [[WIDE_LOAD]], [[WIDE_LOAD4]] ; CHECK-ORDERED-NEXT: [[TMP49:%.*]] = fmul nnan [[WIDE_LOAD1]], [[WIDE_LOAD5]] @@ -2184,36 +2184,36 @@ define float @fmuladd_strict_fmf(ptr %a, ptr %b, i64 %n) #0 { ; CHECK-ORDERED-TF-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP41]] ; CHECK-ORDERED-TF-NEXT: [[TMP50:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP46]] ; CHECK-ORDERED-TF-NEXT: [[TMP51:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 8 ; CHECK-ORDERED-TF-NEXT: [[TMP54:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 16 ; CHECK-ORDERED-TF-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 24 ; CHECK-ORDERED-TF-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP47]], i64 [[TMP59]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD9:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK8]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP61:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP31]] ; CHECK-ORDERED-TF-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP36]] ; CHECK-ORDERED-TF-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP41]] ; CHECK-ORDERED-TF-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP46]] ; CHECK-ORDERED-TF-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i32 0 -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP66:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP67:%.*]] = mul i64 [[TMP66]], 8 ; CHECK-ORDERED-TF-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr 
[[TMP61]], i64 [[TMP67]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK6]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP69:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP70:%.*]] = mul i64 [[TMP69]], 16 ; CHECK-ORDERED-TF-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP70]] -; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP72:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-ORDERED-TF-NEXT: [[TMP73:%.*]] = mul i64 [[TMP72]], 24 ; CHECK-ORDERED-TF-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP61]], i64 [[TMP73]] +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP65]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD13:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP68]], i32 4, [[ACTIVE_LANE_MASK6]], poison) +; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD14:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP71]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-ORDERED-TF-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call @llvm.masked.load.nxv8f32.p0(ptr [[TMP74]], i32 4, [[ACTIVE_LANE_MASK8]], poison) ; CHECK-ORDERED-TF-NEXT: [[TMP75:%.*]] = fmul nnan [[WIDE_MASKED_LOAD]], [[WIDE_MASKED_LOAD12]] ; CHECK-ORDERED-TF-NEXT: [[TMP76:%.*]] = fmul nnan [[WIDE_MASKED_LOAD9]], [[WIDE_MASKED_LOAD13]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll index 23fd5fe5e908e..54095156ce2b6 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect-strict-reductions.ll @@ -32,10 +32,10 @@ define float @fadd_strict(ptr noalias nocapture readonly %a, i64 %n) { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP12]], align 4 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[VEC_PHI]], [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP17]] = call float @llvm.vector.reduce.fadd.nxv4f32(float [[TMP16]], [[WIDE_LOAD2]]) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll index 6982b3f0943ce..77a3f704402e5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -47,10 +47,10 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP11]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i32 0 -; CHECK-NEXT: store shufflevector ( insertelement ( 
poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP14]], align 1 ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 16 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP12]], i64 [[TMP16]] +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP14]], align 1 ; CHECK-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP17]], align 1 ; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 32 @@ -125,10 +125,10 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-VF8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP9]] ; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i32 0 -; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP12]], align 1 ; CHECK-VF8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF8-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 16 ; CHECK-VF8-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP12]], align 1 ; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), ptr [[TMP15]], align 1 ; CHECK-VF8-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF8-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 32 @@ -146,13 +146,13 @@ define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK-VF8: vec.epilog.vector.body: -; CHECK-VF8-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-VF8-NEXT: [[TMP19:%.*]] = add i64 [[INDEX2]], 0 +; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-VF8-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], 0 ; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP19]] ; CHECK-VF8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP20]], i32 0 ; CHECK-VF8-NEXT: store <8 x i8> , ptr [[TMP21]], align 1 -; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8 -; CHECK-VF8-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024 +; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 +; CHECK-VF8-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024 ; CHECK-VF8-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK-VF8: vec.epilog.middle.block: ; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -225,10 +225,10 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr 
[[TMP12]], align 1 ; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP12]], align 1 ; CHECK-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP15]], align 1 ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 @@ -246,13 +246,13 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX2]], 0 +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], 0 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP19]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0 ; CHECK-NEXT: store <8 x i64> , ptr [[TMP21]], align 1 -; CHECK-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8 -; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024 +; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 +; CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024 ; CHECK-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -294,10 +294,10 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP4]] ; CHECK-VF8-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP9]] ; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i32 0 -; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP12]], align 1 ; CHECK-VF8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF8-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 ; CHECK-VF8-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP12]], align 1 ; CHECK-VF8-NEXT: store shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer), ptr [[TMP15]], align 1 ; CHECK-VF8-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF8-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 4 @@ -315,13 +315,13 @@ define void @main_vf_vscale_x_2(ptr %A) #0 vscale_range(8, 8) { ; CHECK-VF8-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK-VF8: vec.epilog.vector.body: -; CHECK-VF8-NEXT: [[INDEX2:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT3:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-VF8-NEXT: [[TMP19:%.*]] = add i64 [[INDEX2]], 0 
+; CHECK-VF8-NEXT: [[INDEX1:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-VF8-NEXT: [[TMP19:%.*]] = add i64 [[INDEX1]], 0 ; CHECK-VF8-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP19]] ; CHECK-VF8-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP20]], i32 0 ; CHECK-VF8-NEXT: store <8 x i64> , ptr [[TMP21]], align 1 -; CHECK-VF8-NEXT: [[INDEX_NEXT3]] = add nuw i64 [[INDEX2]], 8 -; CHECK-VF8-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT3]], 1024 +; CHECK-VF8-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], 8 +; CHECK-VF8-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT2]], 1024 ; CHECK-VF8-NEXT: br i1 [[TMP22]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-VF8: vec.epilog.middle.block: ; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] @@ -384,10 +384,10 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], [[TMP9]] ; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 -; CHECK-NEXT: store zeroinitializer, ptr [[TMP11]], align 1 ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 16 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP13]] +; CHECK-NEXT: store zeroinitializer, ptr [[TMP11]], align 1 ; CHECK-NEXT: store zeroinitializer, ptr [[TMP14]], align 1 ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 32 @@ -467,10 +467,10 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-VF8-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], [[TMP7]] ; CHECK-VF8-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP8]] ; CHECK-VF8-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 0 -; CHECK-VF8-NEXT: store zeroinitializer, ptr [[TMP9]], align 1 ; CHECK-VF8-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF8-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 16 ; CHECK-VF8-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 [[TMP11]] +; CHECK-VF8-NEXT: store zeroinitializer, ptr [[TMP9]], align 1 ; CHECK-VF8-NEXT: store zeroinitializer, ptr [[TMP12]], align 1 ; CHECK-VF8-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-VF8-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 32 @@ -491,13 +491,13 @@ define void @test_pr57912_pointer_induction(ptr %start) #0 { ; CHECK-VF8-NEXT: [[IND_END3:%.*]] = getelementptr i8, ptr [[START]], i64 10000 ; CHECK-VF8-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK-VF8: vec.epilog.vector.body: -; CHECK-VF8-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-VF8-NEXT: [[TMP16:%.*]] = add i64 [[INDEX7]], 0 -; CHECK-VF8-NEXT: [[NEXT_GEP8:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP16]] -; CHECK-VF8-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP8]], i32 0 +; CHECK-VF8-NEXT: [[INDEX6:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-VF8-NEXT: [[TMP16:%.*]] = add i64 [[INDEX6]], 0 +; CHECK-VF8-NEXT: [[NEXT_GEP7:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP16]] +; CHECK-VF8-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[NEXT_GEP7]], i32 0 ; CHECK-VF8-NEXT: store 
<8 x i8> zeroinitializer, ptr [[TMP17]], align 1 -; CHECK-VF8-NEXT: [[INDEX_NEXT9]] = add nuw i64 [[INDEX7]], 8 -; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT9]], 10000 +; CHECK-VF8-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX6]], 8 +; CHECK-VF8-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT8]], 10000 ; CHECK-VF8-NEXT: br i1 [[TMP18]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; CHECK-VF8: vec.epilog.middle.block: ; CHECK-VF8-NEXT: br i1 true, label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll index 995950e46f533..dcb37f2a5464d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-fneg.ll @@ -37,19 +37,19 @@ define void @fneg(ptr nocapture noundef writeonly %d, ptr nocapture noundef read ; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds half, ptr [[S]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds half, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 2 ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = mul i64 [[TMP11]], 8 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds half, ptr [[TMP9]], i64 [[TMP12]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 2 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP13]], align 2 ; CHECK-NEXT: [[TMP14:%.*]] = fneg [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP15:%.*]] = fneg [[WIDE_LOAD3]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds half, ptr [[D]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds half, ptr [[TMP16]], i32 0 -; CHECK-NEXT: store [[TMP14]], ptr [[TMP17]], align 2 ; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds half, ptr [[TMP16]], i64 [[TMP19]] +; CHECK-NEXT: store [[TMP14]], ptr [[TMP17]], align 2 ; CHECK-NEXT: store [[TMP15]], ptr [[TMP20]], align 2 ; CHECK-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll index 3a867cf401b33..be6a7b40edabc 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-gather-scatter.ll @@ -309,10 +309,10 @@ define void @gather_nxv4i32_ind64_stride2(ptr noalias nocapture %a, ptr noalias ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) ; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4f32.nxv4p0( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 2 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds float, ptr [[TMP10]], i64 [[TMP12]] +; CHECK-NEXT: store [[WIDE_MASKED_GATHER]], ptr [[TMP10]], align 4 ; CHECK-NEXT: 
store [[WIDE_MASKED_GATHER2]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 3 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll index 79bda0a4a5d80..544dff0ee5f0b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-multiexit.ll @@ -45,18 +45,18 @@ define void @multiple_exits_unique_exit_block(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP16]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP21]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP19]], align 4 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP22]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP11]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP16]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP25]], align 4 ; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP27]] +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP25]], align 4 ; CHECK-NEXT: store [[WIDE_LOAD3]], ptr [[TMP28]], align 4 ; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 8 @@ -143,18 +143,18 @@ define i32 @multiple_exits_multiple_exit_blocks(ptr %A, ptr %B, i32 %N) #0 { ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP11]] ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP16]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP19]], align 4 ; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 4 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP21]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP19]], align 4 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP22]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP11]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP16]] ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 -; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP25]], align 4 ; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 4 ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP27]] +; CHECK-NEXT: store [[WIDE_LOAD]], ptr [[TMP25]], align 4 ; CHECK-NEXT: store [[WIDE_LOAD3]], ptr [[TMP28]], align 4 ; CHECK-NEXT: [[TMP29:%.*]] = call i32 @llvm.vscale.i32() ; CHECK-NEXT: [[TMP30:%.*]] = mul i32 [[TMP29]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll 
index 2ca5fbebed877..b42e8ab04376f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -28,7 +28,8 @@ target triple = "aarch64-unknown-linux-gnu" ; VPLANS-NEXT: ACTIVE-LANE-MASK-PHI vp<[[LANEMASK_PHI:%[0-9]+]]> = phi vp<[[LANEMASK_ENTRY]]>, vp<[[LANEMASK_LOOP:%[0-9]+]]> ; VPLANS-NEXT: vp<[[STEP:%[0-9]+]]> = SCALAR-STEPS vp<[[INDV]]>, ir<1> ; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEP]]> -; VPLANS-NEXT: WIDEN store ir<%gep>, ir<%val>, vp<[[LANEMASK_PHI]]> +; VPLANS-NEXT: EMIT vp<[[VEC_PTR:%[0-9]+]]> = create-vector-pointer ir<%gep> +; VPLANS-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%val>, vp<[[LANEMASK_PHI]]> ; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%[0-9]+]]> = VF * UF + vp<[[INDV]]> ; VPLANS-NEXT: EMIT vp<[[INC:%[0-9]+]]> = VF * Part + vp<[[INDV]]> ; VPLANS-NEXT: EMIT vp<[[LANEMASK_LOOP]]> = active lane mask vp<[[INC]]>, vp<[[NEWTC]]> diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll index caeb37bce2f18..ca3e725e369a2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-unroll.ll @@ -81,18 +81,18 @@ define void @simple_memset(i32 %val, ptr %ptr, i64 %n) #0 { ; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP41]] ; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP46]] ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]]) ; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 4 ; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]]) ; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 8 ; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]]) ; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12 ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]]) ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]]) ; CHECK-NEXT: [[TMP61:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP62:%.*]] = mul i64 [[TMP61]], 16 @@ -223,18 +223,18 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP41]] ; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i32, ptr [[COND_PTR]], i64 [[TMP46]] ; CHECK-NEXT: [[TMP51:%.*]] = getelementptr i32, ptr [[TMP47]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call 
@llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) ; CHECK-NEXT: [[TMP52:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP53:%.*]] = mul i64 [[TMP52]], 4 ; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP53]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]], poison) ; CHECK-NEXT: [[TMP55:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP56:%.*]] = mul i64 [[TMP55]], 8 ; CHECK-NEXT: [[TMP57:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP56]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]], poison) ; CHECK-NEXT: [[TMP58:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP59:%.*]] = mul i64 [[TMP58]], 12 ; CHECK-NEXT: [[TMP60:%.*]] = getelementptr i32, ptr [[TMP47]], i64 [[TMP59]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP51]], i32 4, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD10:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP54]], i32 4, [[ACTIVE_LANE_MASK7]], poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD11:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP57]], i32 4, [[ACTIVE_LANE_MASK8]], poison) ; CHECK-NEXT: [[WIDE_MASKED_LOAD12:%.*]] = call @llvm.masked.load.nxv4i32.p0(ptr [[TMP60]], i32 4, [[ACTIVE_LANE_MASK9]], poison) ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne [[WIDE_MASKED_LOAD]], zeroinitializer ; CHECK-NEXT: [[TMP62:%.*]] = icmp ne [[WIDE_MASKED_LOAD10]], zeroinitializer @@ -249,18 +249,18 @@ define void @cond_memset(i32 %val, ptr noalias readonly %cond_ptr, ptr noalias % ; CHECK-NEXT: [[TMP71:%.*]] = select [[ACTIVE_LANE_MASK8]], [[TMP63]], zeroinitializer ; CHECK-NEXT: [[TMP72:%.*]] = select [[ACTIVE_LANE_MASK9]], [[TMP64]], zeroinitializer ; CHECK-NEXT: [[TMP73:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0 -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, [[TMP69]]) ; CHECK-NEXT: [[TMP74:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP75:%.*]] = mul i64 [[TMP74]], 4 ; CHECK-NEXT: [[TMP76:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP75]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, [[TMP70]]) ; CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP78:%.*]] = mul i64 [[TMP77]], 8 ; CHECK-NEXT: [[TMP79:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP78]] -; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, [[TMP71]]) ; CHECK-NEXT: [[TMP80:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP81:%.*]] = mul i64 [[TMP80]], 12 ; CHECK-NEXT: [[TMP82:%.*]] = getelementptr i32, ptr [[TMP65]], i64 [[TMP81]] +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP73]], i32 4, [[TMP69]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP76]], i32 4, [[TMP70]]) +; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP79]], i32 4, [[TMP71]]) ; CHECK-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT]], ptr [[TMP82]], i32 4, [[TMP72]]) ; CHECK-NEXT: [[TMP83:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP84:%.*]] = mul i64 [[TMP83]], 16 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll index 
58c54103c72c6..70833e44b075a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse-mask4.ll @@ -22,8 +22,8 @@ define void @vector_reverse_mask_nxv4i1(ptr %a, ptr %cond, i64 %N) #0 { ; CHECK: %[[WIDEMSKLOAD:.*]] = call @llvm.masked.load.nxv4f64.p0(ptr %{{.*}}, i32 8, %[[REVERSE6]], poison) ; CHECK: %[[REVERSE7:.*]] = call @llvm.experimental.vector.reverse.nxv4f64( %[[WIDEMSKLOAD]]) ; CHECK: %[[FADD:.*]] = fadd %[[REVERSE7]] +; CHECK: %[[REVERSE9:.*]] = call @llvm.experimental.vector.reverse.nxv4i1( %{{.*}}) ; CHECK: %[[REVERSE8:.*]] = call @llvm.experimental.vector.reverse.nxv4f64( %[[FADD]]) -; CHECK: %[[REVERSE9:.*]] = call @llvm.experimental.vector.reverse.nxv4i1( %{{.*}}) ; CHECK: call void @llvm.masked.store.nxv4f64.p0( %[[REVERSE8]], ptr %{{.*}}, i32 8, %[[REVERSE9]] entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll index e5a7ef38caacc..012125bc6f4aa 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-vector-reverse.ll @@ -33,13 +33,13 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{ ; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[TMP7]], 3 ; CHECK-NEXT: [[TMP9:%.*]] = sub i64 1, [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP9]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 3 ; CHECK-NEXT: [[TMP13:%.*]] = sub i64 0, [[TMP12]] ; CHECK-NEXT: [[TMP14:%.*]] = sub i64 1, [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[TMP15]], i64 [[TMP14]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP10]], align 8 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP16]], align 8 ; CHECK-NEXT: [[TMP17:%.*]] = fadd [[WIDE_LOAD]], shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP18:%.*]] = fadd [[WIDE_LOAD1]], shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) @@ -48,13 +48,13 @@ define void @vector_reverse_f64(i64 %N, ptr noalias %a, ptr noalias %b) #0{ ; CHECK-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 3 ; CHECK-NEXT: [[TMP22:%.*]] = sub i64 1, [[TMP21]] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i64 [[TMP22]] -; CHECK-NEXT: store [[TMP17]], ptr [[TMP23]], align 8 ; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP25:%.*]] = shl i64 [[TMP24]], 3 ; CHECK-NEXT: [[TMP26:%.*]] = sub i64 0, [[TMP25]] ; CHECK-NEXT: [[TMP27:%.*]] = sub i64 1, [[TMP25]] ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i64 [[TMP26]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds double, ptr [[TMP28]], i64 [[TMP27]] +; CHECK-NEXT: store [[TMP17]], ptr [[TMP23]], align 8 ; CHECK-NEXT: store [[TMP18]], ptr [[TMP29]], align 8 ; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP31:%.*]] = shl i64 [[TMP30]], 4 @@ -138,13 +138,13 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP14:%.*]] = shl i64 [[TMP13]], 3 ; CHECK-NEXT: [[TMP15:%.*]] = sub i64 1, [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr 
inbounds i64, ptr [[TMP12]], i64 [[TMP15]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP16]], align 8 ; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 3 ; CHECK-NEXT: [[TMP19:%.*]] = sub i64 0, [[TMP18]] ; CHECK-NEXT: [[TMP20:%.*]] = sub i64 1, [[TMP18]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i64, ptr [[TMP12]], i64 [[TMP19]] ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP21]], i64 [[TMP20]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP16]], align 8 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load , ptr [[TMP22]], align 8 ; CHECK-NEXT: [[TMP23:%.*]] = add [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP24:%.*]] = add [[WIDE_LOAD3]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) @@ -153,13 +153,13 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: [[TMP27:%.*]] = shl i64 [[TMP26]], 3 ; CHECK-NEXT: [[TMP28:%.*]] = sub i64 1, [[TMP27]] ; CHECK-NEXT: [[TMP29:%.*]] = getelementptr inbounds i64, ptr [[TMP25]], i64 [[TMP28]] -; CHECK-NEXT: store [[TMP23]], ptr [[TMP29]], align 8 ; CHECK-NEXT: [[TMP30:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP31:%.*]] = shl i64 [[TMP30]], 3 ; CHECK-NEXT: [[TMP32:%.*]] = sub i64 0, [[TMP31]] ; CHECK-NEXT: [[TMP33:%.*]] = sub i64 1, [[TMP31]] ; CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds i64, ptr [[TMP25]], i64 [[TMP32]] ; CHECK-NEXT: [[TMP35:%.*]] = getelementptr inbounds i64, ptr [[TMP34]], i64 [[TMP33]] +; CHECK-NEXT: store [[TMP23]], ptr [[TMP29]], align 8 ; CHECK-NEXT: store [[TMP24]], ptr [[TMP35]], align 8 ; CHECK-NEXT: [[TMP36:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP37:%.*]] = shl i64 [[TMP36]], 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll index 62c778aca51d0..3d413d10bbad8 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-gep.ll @@ -21,10 +21,13 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: EMIT ir<%ptr.iv.1> = WIDEN-POINTER-INDUCTION ir<%start.1>, 8 ; CHECK-NEXT: EMIT ir<%ptr.iv.2> = WIDEN-POINTER-INDUCTION ir<%start.2>, 1 ; CHECK-NEXT: WIDEN-GEP Var[Inv] ir<%ptr.iv.2.next> = getelementptr inbounds ir<%ptr.iv.2>, ir<1> -; CHECK-NEXT: WIDEN store ir<%ptr.iv.1>, ir<%ptr.iv.2.next> -; CHECK-NEXT: WIDEN ir<%lv> = load ir<%ptr.iv.2> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%ptr.iv.1> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%ptr.iv.2.next> +; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%ptr.iv.2> +; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR2]]> ; CHECK-NEXT: WIDEN ir<%add> = add ir<%lv>, ir<1> -; CHECK-NEXT: WIDEN store ir<%ptr.iv.2>, ir<%add> +; CHECK-NEXT: EMIT vp<[[VEC_PTR3:%.+]]> = create-vector-pointer ir<%ptr.iv.2> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR3]]>, ir<%add> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index 9cd2781ae2359..09e6e7b53007e 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -48,18 +48,18 @@ 
define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt ; CHECK-NEXT: [[TMP13:%.*]] = add nsw [[TMP9]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP14:%.*]] = add nsw [[TMP11]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP17]] +; CHECK-NEXT: store [[TMP13]], ptr [[TMP15]], align 4 ; CHECK-NEXT: store [[TMP14]], ptr [[TMP18]], align 4 ; CHECK-NEXT: [[TMP19:%.*]] = add nsw [[TMP10]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP20:%.*]] = add nsw [[TMP12]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP19]], ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i64 [[TMP22]], 2 ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP23]] +; CHECK-NEXT: store [[TMP19]], ptr [[TMP21]], align 4 ; CHECK-NEXT: store [[TMP20]], ptr [[TMP24]], align 4 ; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[TMP25]], 3 @@ -150,17 +150,17 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 2 ; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i64 [[TMP8]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = shl nsw [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; CHECK-NEXT: [[TMP11:%.*]] = shl nsw [[WIDE_LOAD7]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: store [[TMP10]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[NEXT_GEP5]], i64 [[TMP13]] +; CHECK-NEXT: store [[TMP10]], ptr [[NEXT_GEP5]], align 4 ; CHECK-NEXT: store [[TMP11]], ptr [[TMP14]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 3 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll index eaabc263913a7..83f582944d03b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/synthesize-mask-for-call.ll @@ -21,10 +21,12 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = 
CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep> +; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: REPLICATE ir<%call> = call @foo(ir<%load>) ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -47,10 +49,12 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[STEPS]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep> +; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed4_nomask) ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -78,10 +82,12 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep> +; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed2_nomask) ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXST:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -104,10 +110,12 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep> +; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>, ir) (using library function: foo_vector_fixed4_mask) ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx> +; CHECK-NEXT:
WIDEN store vp<[[VEC_PTR2]]>, ir<%call> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -134,10 +142,12 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep> +; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed2_nomask) ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors @@ -160,10 +170,12 @@ target triple = "aarch64-unknown-linux-gnu" ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%b>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN ir<%load> = load ir<%gep> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep> +; CHECK-NEXT: WIDEN ir<%load> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CALL ir<%call> = call @foo(ir<%load>) (using library function: foo_vector_fixed4_nomask) ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN store ir<%arrayidx>, ir<%call> +; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call> ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll index e8e2008912c83..0158d4c2c6c0c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll @@ -35,24 +35,24 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N) ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[N]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[COND:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 -3 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 -7 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 -7 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 8 ; CHECK-NEXT: [[REVERSE2:%.*]] = shufflevector <4 x double> [[WIDE_LOAD1]], <4 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = fcmp une <4 x double> [[REVERSE2]], zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = 
getelementptr double, ptr [[A:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[TMP7]], i64 -3 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[TMP7]], i64 -7 ; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> poison, <4 x i32> +; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE3]], <4 x double> poison) -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[TMP7]], i64 -7 -; CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> -; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE5]], <4 x double> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE4]], <4 x double> poison) ; CHECK-NEXT: [[TMP10:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD]], ; CHECK-NEXT: [[TMP11:%.*]] = fadd <4 x double> [[WIDE_MASKED_LOAD6]], ; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP10]], ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE3]]) -; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP11]], ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE5]]) +; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> [[TMP11]], ptr [[TMP9]], i32 8, <4 x i1> [[REVERSE4]]) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll index 6bd305fe33638..2abc787061b53 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse.ll @@ -17,9 +17,9 @@ define void @vector_reverse_f64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x double> %[[WIDE]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: %[[FADD:.*]] = fadd <8 x double> %[[REVERSE]] ; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds double, ptr {{.*}}, i64 {{.*}} -; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x double> %[[FADD]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds double, ptr %[[GEP2]], i32 0 ; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds double, ptr %[[GEP3]], i32 -7 +; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x double> %[[FADD]], <8 x double> poison, <8 x i32> ; CHECK-NEXT: store <8 x double> %[[REVERSE6]], ptr %[[GEP4]], align 8 entry: @@ -50,9 +50,9 @@ define void @vector_reverse_i64(i64 %N, ptr %a, ptr %b) #0 { ; CHECK-NEXT: %[[REVERSE:.*]] = shufflevector <8 x i64> %[[WIDE]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: %[[FADD:.*]] = add <8 x i64> %[[REVERSE]] ; CHECK-NEXT: %[[GEP2:.*]] = getelementptr inbounds i64, ptr {{.*}}, i64 {{.*}} -; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x i64> %[[FADD]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: %[[GEP3:.*]] = getelementptr inbounds i64, ptr %[[GEP2]], i32 0 ; CHECK-NEXT: %[[GEP4:.*]] = getelementptr inbounds i64, ptr %[[GEP3]], i32 -7 +; CHECK-NEXT: %[[REVERSE6:.*]] = shufflevector <8 x i64> %[[FADD]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: store <8 x i64> %[[REVERSE6]], ptr %[[GEP4]], align 
8 entry: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll index f82eb9c7822f2..44c78e4745de5 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/widen-call-with-intrinsic-or-libfunc.ll @@ -18,7 +18,8 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN ir<%l> = load ir<%gep.src> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep.src> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%conv> = fpext ir<%l> to double ; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using library function: __simd_sin_v2f64) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> @@ -45,7 +46,8 @@ target triple = "arm64-apple-ios" ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1> ; CHECK-NEXT: CLONE ir<%gep.src> = getelementptr inbounds ir<%src>, vp<[[STEPS]]> -; CHECK-NEXT: WIDEN ir<%l> = load ir<%gep.src> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep.src> +; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN-CAST ir<%conv> = fpext ir<%l> to double ; CHECK-NEXT: WIDEN-CALL ir<%s> = call @llvm.sin.f64(ir<%conv>) (using vector intrinsic) ; CHECK-NEXT: REPLICATE ir<%gep.dst> = getelementptr inbounds ir<%dst>, vp<[[STEPS]]> diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll index c00f747d54db1..c3f307c234ed9 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll @@ -753,8 +753,8 @@ define hidden void @pointer_phi_v4i32_uf2(ptr noalias nocapture readonly %A, ptr ; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4 +; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 192 @@ -824,12 +824,12 @@ define hidden void @pointer_phi_v4i32_uf4(ptr noalias nocapture readonly %A, ptr ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4 -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 8 -; CHECK-NEXT: 
store <4 x i32> [[TMP7]], ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 12 +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[NEXT_GEP]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP9]], align 4 +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP10]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP11]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 384 diff --git a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll index a448b3086c449..c358c835597dd 100644 --- a/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/PowerPC/optimal-epilog-vectorization.ll @@ -48,28 +48,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-TWO-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP10]] ; VF-TWO-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP11]] ; VF-TWO-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP24]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP26]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 8 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP28]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 12 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP30]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 16 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP32]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 20 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP34]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 24 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP36]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 28 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP38]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 32 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP40]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 36 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP42]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 40 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP44]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 44 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP24]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP26]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP28]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP30]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, 
ptr [[TMP32]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP34]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP36]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP38]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP40]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP42]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP44]], align 4 ; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP46]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[TMP0]] ; VF-TWO-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP1]] @@ -84,28 +84,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-TWO-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP10]] ; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP11]] ; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 4 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP62]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 8 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP64]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 12 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 16 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP68]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 20 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP70]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 24 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 28 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP74]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 32 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP76]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 36 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP78]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 40 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP80]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 44 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP62]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP64]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP68]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD18:%.*]] 
= load <4 x float>, ptr [[TMP70]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP74]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP76]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP78]], align 4 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP80]], align 4 ; VF-TWO-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, ptr [[TMP82]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD13]] ; VF-TWO-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD14]] @@ -132,28 +132,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP10]] ; VF-TWO-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP11]] ; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 0 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP108]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP110]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 8 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP112]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 12 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP114]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 16 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP88]], ptr [[TMP116]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 20 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP89]], ptr [[TMP118]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 24 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP90]], ptr [[TMP120]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 28 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP91]], ptr [[TMP122]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 32 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP92]], ptr [[TMP124]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 36 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP93]], ptr [[TMP126]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 40 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP94]], ptr [[TMP128]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 44 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP108]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP110]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP112]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP114]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP88]], ptr [[TMP116]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP89]], ptr [[TMP118]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP90]], ptr [[TMP120]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP91]], ptr [[TMP122]], align 4 +; VF-TWO-CHECK-NEXT: store 
<4 x float> [[TMP92]], ptr [[TMP124]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP93]], ptr [[TMP126]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP94]], ptr [[TMP128]], align 4 ; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP95]], ptr [[TMP130]], align 4 ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 48 ; VF-TWO-CHECK-NEXT: [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -251,28 +251,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-FOUR-CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP10]] ; VF-FOUR-CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[BB]], i64 [[TMP11]] ; VF-FOUR-CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 0 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP24]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP26]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 8 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP28]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 12 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP30]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 16 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP32]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 20 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP34]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 24 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP36]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 28 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP38]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 32 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP40]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 36 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP42]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP44:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 40 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP44]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP46:%.*]] = getelementptr inbounds float, ptr [[TMP12]], i32 44 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP24]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP26]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP28]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP30]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP32]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP34]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x float>, ptr [[TMP36]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP38]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP40]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr 
[[TMP42]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP44]], align 4 ; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP46]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds float, ptr [[CC:%.*]], i64 [[TMP0]] ; VF-FOUR-CHECK-NEXT: [[TMP49:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP1]] @@ -287,28 +287,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-FOUR-CHECK-NEXT: [[TMP58:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP10]] ; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[CC]], i64 [[TMP11]] ; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 4 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP62]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP64:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 8 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP64]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 12 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 16 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP68]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP70:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 20 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP70]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 24 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 28 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP74]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP76:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 32 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP76]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 36 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP78]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 40 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP80]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP82:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 44 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP62]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD15:%.*]] = load <4 x float>, ptr [[TMP64]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD16:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <4 x float>, ptr [[TMP68]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <4 x float>, ptr [[TMP70]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD20:%.*]] = load <4 x float>, ptr [[TMP74]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD21:%.*]] = load <4 x float>, ptr [[TMP76]], align 4 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD22:%.*]] = load <4 x float>, ptr [[TMP78]], align 4 
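Note on the reordered checks in these hunks: with pointer creation now emitted as its own VPInstruction (the create-vector-pointer recipe shown in the VPlan dumps), the per-part address GEPs are all materialized before the widened loads and stores that use them, instead of being interleaved with them; the lines only move, no instruction is added or dropped. A minimal sketch of the resulting shape for a consecutive access with VF=4 and UF=2 follows; the function and value names are illustrative assumptions, not taken from any test in this patch.

define void @consecutive_copy_sketch(ptr %dst, ptr %src, i64 %index) {
entry:
  ; Both per-part pointers are computed first ...
  %gep.src = getelementptr inbounds i32, ptr %src, i64 %index
  %vec.ptr0 = getelementptr inbounds i32, ptr %gep.src, i32 0
  %vec.ptr1 = getelementptr inbounds i32, ptr %gep.src, i32 4
  ; ... and only then do the wide loads and stores follow.
  %wide.load0 = load <4 x i32>, ptr %vec.ptr0, align 4
  %wide.load1 = load <4 x i32>, ptr %vec.ptr1, align 4
  %gep.dst = getelementptr inbounds i32, ptr %dst, i64 %index
  %vec.ptr2 = getelementptr inbounds i32, ptr %gep.dst, i32 0
  %vec.ptr3 = getelementptr inbounds i32, ptr %gep.dst, i32 4
  store <4 x i32> %wide.load0, ptr %vec.ptr2, align 4
  store <4 x i32> %wide.load1, ptr %vec.ptr3, align 4
  ret void
}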
+; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD23:%.*]] = load <4 x float>, ptr [[TMP80]], align 4 ; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD24:%.*]] = load <4 x float>, ptr [[TMP82]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP84:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD13]] ; VF-FOUR-CHECK-NEXT: [[TMP85:%.*]] = fadd fast <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD14]] @@ -335,28 +335,28 @@ define dso_local void @f1(ptr noalias %aa, ptr noalias %bb, ptr noalias %cc, i32 ; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP10]] ; VF-FOUR-CHECK-NEXT: [[TMP107:%.*]] = getelementptr inbounds float, ptr [[AA]], i64 [[TMP11]] ; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 0 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP108]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP110]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 8 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP112]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP114:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 12 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP114]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP116:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 16 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP88]], ptr [[TMP116]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP118:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 20 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP89]], ptr [[TMP118]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP120:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 24 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP90]], ptr [[TMP120]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP122:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 28 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP91]], ptr [[TMP122]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP124:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 32 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP92]], ptr [[TMP124]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP126:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 36 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP93]], ptr [[TMP126]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP128:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 40 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP94]], ptr [[TMP128]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP130:%.*]] = getelementptr inbounds float, ptr [[TMP96]], i32 44 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP108]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP110]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP112]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP114]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP88]], ptr [[TMP116]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP89]], ptr [[TMP118]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP90]], ptr [[TMP120]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP91]], ptr [[TMP122]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP92]], ptr [[TMP124]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP93]], ptr [[TMP126]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP94]], ptr [[TMP128]], align 4 ; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP95]], ptr [[TMP130]], align 4 ; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 48 ; 
VF-FOUR-CHECK-NEXT: [[TMP132:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -524,34 +524,34 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-TWO-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP47]] ; VF-TWO-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 ; VF-TWO-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP56]], i32 -3 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4 -; VF-TWO-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> ; VF-TWO-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -4 ; VF-TWO-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP59]], i32 -3 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 -; VF-TWO-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> ; VF-TWO-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -8 ; VF-TWO-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[TMP62]], i32 -3 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP63]], align 4 -; VF-TWO-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> ; VF-TWO-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -12 ; VF-TWO-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP65]], i32 -3 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 -; VF-TWO-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> ; VF-TWO-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -16 ; VF-TWO-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 -3 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP69]], align 4 -; VF-TWO-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> ; VF-TWO-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -20 ; VF-TWO-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP71]], i32 -3 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 -; VF-TWO-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> ; VF-TWO-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -24 ; VF-TWO-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, ptr [[TMP74]], i32 -3 -; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP75]], align 4 -; VF-TWO-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> ; VF-TWO-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -28 ; VF-TWO-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP77]], i32 -3 +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4 +; VF-TWO-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 +; VF-TWO-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP63]], align 4 
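For reversed consecutive accesses, the recipe is printed with the pointer operand repeated (create-vector-pointer ir<%p>, ir<%p>, as in the riscv-vector-reverse.ll plan dumps further down), and per part it still emits the two-GEP sequence that steps back to the last element before the load, so the i32 0 / i32 -3 pairs in the f2 checks here are unchanged apart from the grouping; the loaded value is then put back in source order with a reverse shufflevector. A minimal runnable sketch of part 0 for a fixed VF of 4; the names are assumptions for illustration only, not from these tests.

define <4 x float> @reverse_load_sketch(ptr %base) {
entry:
  ; Part 0 of a reversed access: step back so the wide load starts at the
  ; last lane of the group, then restore source order with a reverse shuffle.
  %part.ptr = getelementptr inbounds float, ptr %base, i32 0
  %last.lane = getelementptr inbounds float, ptr %part.ptr, i32 -3
  %wide.load = load <4 x float>, ptr %last.lane, align 4
  %reverse = shufflevector <4 x float> %wide.load, <4 x float> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x float> %reverse
}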
+; VF-TWO-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 +; VF-TWO-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP69]], align 4 +; VF-TWO-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 +; VF-TWO-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> +; VF-TWO-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP75]], align 4 +; VF-TWO-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> ; VF-TWO-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP78]], align 4 ; VF-TWO-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> ; VF-TWO-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], @@ -571,20 +571,20 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-TWO-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]] ; VF-TWO-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] ; VF-TWO-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 0 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP80]], ptr [[TMP96]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 4 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP81]], ptr [[TMP98]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 8 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP82]], ptr [[TMP100]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 12 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP83]], ptr [[TMP102]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 16 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP104]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 20 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP106]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 24 -; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP108]], align 4 ; VF-TWO-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 28 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP80]], ptr [[TMP96]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP81]], ptr [[TMP98]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP82]], ptr [[TMP100]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP83]], ptr [[TMP102]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP104]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP106]], align 4 +; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP108]], align 4 ; VF-TWO-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP110]], align 4 ; VF-TWO-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; VF-TWO-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -731,34 +731,34 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; 
VF-FOUR-CHECK-NEXT: [[TMP55:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP47]] ; VF-FOUR-CHECK-NEXT: [[TMP56:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 0 ; VF-FOUR-CHECK-NEXT: [[TMP57:%.*]] = getelementptr inbounds float, ptr [[TMP56]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4 -; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> ; VF-FOUR-CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -4 ; VF-FOUR-CHECK-NEXT: [[TMP60:%.*]] = getelementptr inbounds float, ptr [[TMP59]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 -; VF-FOUR-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> ; VF-FOUR-CHECK-NEXT: [[TMP62:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -8 ; VF-FOUR-CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds float, ptr [[TMP62]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP63]], align 4 -; VF-FOUR-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> ; VF-FOUR-CHECK-NEXT: [[TMP65:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -12 ; VF-FOUR-CHECK-NEXT: [[TMP66:%.*]] = getelementptr inbounds float, ptr [[TMP65]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 -; VF-FOUR-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> ; VF-FOUR-CHECK-NEXT: [[TMP68:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -16 ; VF-FOUR-CHECK-NEXT: [[TMP69:%.*]] = getelementptr inbounds float, ptr [[TMP68]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP69]], align 4 -; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> ; VF-FOUR-CHECK-NEXT: [[TMP71:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -20 ; VF-FOUR-CHECK-NEXT: [[TMP72:%.*]] = getelementptr inbounds float, ptr [[TMP71]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 -; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> ; VF-FOUR-CHECK-NEXT: [[TMP74:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -24 ; VF-FOUR-CHECK-NEXT: [[TMP75:%.*]] = getelementptr inbounds float, ptr [[TMP74]], i32 -3 -; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP75]], align 4 -; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> ; VF-FOUR-CHECK-NEXT: [[TMP77:%.*]] = getelementptr inbounds float, ptr [[TMP48]], i32 -28 ; VF-FOUR-CHECK-NEXT: [[TMP78:%.*]] = getelementptr inbounds float, ptr [[TMP77]], i32 -3 +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP57]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x float> [[WIDE_LOAD]], <4 x float> poison, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP60]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x float> [[WIDE_LOAD2]], <4 x float> poison, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP63]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE5:%.*]] = shufflevector <4 x float> [[WIDE_LOAD4]], <4 x float> poison, <4 x i32> +; VF-FOUR-CHECK-NEXT: 
[[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP66]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE7:%.*]] = shufflevector <4 x float> [[WIDE_LOAD6]], <4 x float> poison, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP69]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE9:%.*]] = shufflevector <4 x float> [[WIDE_LOAD8]], <4 x float> poison, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP72]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE11:%.*]] = shufflevector <4 x float> [[WIDE_LOAD10]], <4 x float> poison, <4 x i32> +; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD12:%.*]] = load <4 x float>, ptr [[TMP75]], align 4 +; VF-FOUR-CHECK-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x float> [[WIDE_LOAD12]], <4 x float> poison, <4 x i32> ; VF-FOUR-CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <4 x float>, ptr [[TMP78]], align 4 ; VF-FOUR-CHECK-NEXT: [[REVERSE15:%.*]] = shufflevector <4 x float> [[WIDE_LOAD14]], <4 x float> poison, <4 x i32> ; VF-FOUR-CHECK-NEXT: [[TMP80:%.*]] = fadd fast <4 x float> [[REVERSE]], @@ -778,20 +778,20 @@ define dso_local signext i32 @f2(ptr noalias %A, ptr noalias %B, i32 signext %n) ; VF-FOUR-CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP22]] ; VF-FOUR-CHECK-NEXT: [[TMP95:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP23]] ; VF-FOUR-CHECK-NEXT: [[TMP96:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 0 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP80]], ptr [[TMP96]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP98:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 4 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP81]], ptr [[TMP98]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP100:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 8 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP82]], ptr [[TMP100]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP102:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 12 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP83]], ptr [[TMP102]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP104:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 16 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP104]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP106:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 20 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP106]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP108:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 24 -; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP108]], align 4 ; VF-FOUR-CHECK-NEXT: [[TMP110:%.*]] = getelementptr inbounds float, ptr [[TMP88]], i32 28 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP80]], ptr [[TMP96]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP81]], ptr [[TMP98]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP82]], ptr [[TMP100]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP83]], ptr [[TMP102]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP84]], ptr [[TMP104]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP85]], ptr [[TMP106]], align 4 +; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP86]], ptr [[TMP108]], align 4 ; VF-FOUR-CHECK-NEXT: store <4 x float> [[TMP87]], ptr [[TMP110]], align 4 ; VF-FOUR-CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; VF-FOUR-CHECK-NEXT: [[TMP112:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll index a6ab8d1a1f35d..ecf7083ab7fad 100644 --- 
a/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/divrem.ll @@ -67,8 +67,8 @@ define void @vector_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; FIXED-NEXT: [[TMP6:%.*]] = udiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; FIXED-NEXT: [[TMP7:%.*]] = udiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]] @@ -171,8 +171,8 @@ define void @vector_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; FIXED-NEXT: [[TMP6:%.*]] = sdiv <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; FIXED-NEXT: [[TMP7:%.*]] = sdiv <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]] @@ -275,8 +275,8 @@ define void @vector_urem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; FIXED-NEXT: [[TMP6:%.*]] = urem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; FIXED-NEXT: [[TMP7:%.*]] = urem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]] @@ -379,8 +379,8 @@ define void @vector_srem(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; FIXED-NEXT: [[TMP6:%.*]] = srem <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; FIXED-NEXT: [[TMP7:%.*]] = srem <4 x i64> [[WIDE_LOAD1]], [[BROADCAST_SPLAT]] @@ -493,8 +493,8 @@ define void @predicated_udiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = 
getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer ; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer @@ -627,8 +627,8 @@ define void @predicated_sdiv(ptr noalias nocapture %a, i64 %v, i64 %n) { ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer ; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[BROADCAST_SPLAT]], zeroinitializer @@ -756,8 +756,8 @@ define void @predicated_udiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], ; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], @@ -883,8 +883,8 @@ define void @predicated_sdiv_by_constant(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP4]], align 8 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8 ; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD]], ; FIXED-NEXT: [[TMP7:%.*]] = icmp ne <4 x i64> [[WIDE_LOAD1]], @@ -1011,8 +1011,8 @@ define void @predicated_sdiv_by_minus_one(ptr noalias nocapture %a, i64 %n) { ; FIXED-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 [[TMP0]] ; FIXED-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 [[TMP1]] ; FIXED-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 0 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1 ; FIXED-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i32 32 +; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[TMP4]], align 1 ; FIXED-NEXT: [[WIDE_LOAD1:%.*]] = load <32 x i8>, ptr [[TMP5]], align 1 ; FIXED-NEXT: [[TMP6:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD]], ; FIXED-NEXT: 
[[TMP7:%.*]] = icmp ne <32 x i8> [[WIDE_LOAD1]], diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll index 29ef7364b8212..ee7ffe24739b6 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/interleaved-accesses.ll @@ -553,8 +553,8 @@ define void @combine_load_factor2_i32(ptr noalias %p, ptr noalias %q) { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[Q:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[Q]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; CHECK-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 8 +; CHECK-NEXT: store <8 x i32> [[TMP8]], ptr [[TMP12]], align 4 ; CHECK-NEXT: store <8 x i32> [[TMP9]], ptr [[TMP13]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll index cad64f5c7e2be..949987d91e145 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -66,10 +66,12 @@ define void @vector_reverse_i64(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1> ; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: WIDEN ir<%1> = load ir<%arrayidx> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%arrayidx>, ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN ir<%add9> = add ir<%1>, ir<1> ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: WIDEN store ir<%arrayidx3>, ir<%add9> +; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx3>, ir<%arrayidx3> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add9> ; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors @@ -203,10 +205,12 @@ define void @vector_reverse_f32(ptr nocapture noundef writeonly %A, ptr nocaptur ; CHECK-NEXT: CLONE ir<%i.0> = add nsw vp<[[STEPS]]>, ir<-1> ; CHECK-NEXT: CLONE ir<%idxprom> = zext ir<%i.0> ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%B>, ir<%idxprom> -; CHECK-NEXT: WIDEN ir<%1> = load ir<%arrayidx> +; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%arrayidx>, ir<%arrayidx> +; CHECK-NEXT: WIDEN ir<%1> = load vp<[[VEC_PTR]]> ; CHECK-NEXT: WIDEN ir<%conv1> = fadd ir<%1>, ir<1.000000e+00> ; CHECK-NEXT: CLONE ir<%arrayidx3> = getelementptr inbounds ir<%A>, ir<%idxprom> -; CHECK-NEXT: WIDEN store ir<%arrayidx3>, ir<%conv1> +; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx3>, ir<%arrayidx3> +; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%conv1> ; CHECK-NEXT: EMIT vp<[[IV_INC:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_INC]]>, vp<[[VEC_TC]]> ; CHECK-NEXT: No successors diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll index 
89c3219c4b6d0..3749847c9ba10 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/uniform-load-store.ll @@ -68,8 +68,8 @@ define void @uniform_load(ptr noalias nocapture %a, ptr noalias nocapture %b, i6 ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 ; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 ; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -244,8 +244,8 @@ define i64 @uniform_load_outside_use(ptr noalias nocapture %a, ptr noalias nocap ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 ; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 ; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -402,8 +402,8 @@ define void @conditional_uniform_load(ptr noalias nocapture %a, ptr noalias noca ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 0 -; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 8 ; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI]], ptr [[TMP8]], align 8 ; FIXEDLEN-NEXT: store <4 x i64> [[PREDPHI3]], ptr [[TMP9]], align 8 ; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], @@ -628,8 +628,8 @@ define void @uniform_load_unaligned(ptr noalias nocapture %a, ptr noalias nocapt ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 0 -; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP3]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP5]], align 8 ; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT2]], ptr [[TMP6]], align 8 ; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -803,8 +803,8 @@ define void @uniform_store(ptr noalias nocapture %a, ptr noalias nocapture %b, i ; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, 
ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 ; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 ; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -995,8 +995,8 @@ define void @uniform_store_of_loop_varying(ptr noalias nocapture %a, ptr noalias ; FIXEDLEN-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP4]] ; FIXEDLEN-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 0 -; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8 ; FIXEDLEN-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[TMP8]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP10]], align 8 ; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP11]], align 8 ; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 @@ -1235,8 +1235,8 @@ define void @conditional_uniform_store(ptr noalias nocapture %a, ptr noalias noc ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 0 -; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 ; FIXEDLEN-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[TMP4]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP6]], align 8 ; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP7]], align 8 ; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[STEP_ADD]], @@ -1459,8 +1459,8 @@ define void @uniform_store_unaligned(ptr noalias nocapture %a, ptr noalias nocap ; FIXEDLEN-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; FIXEDLEN-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; FIXEDLEN-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 ; FIXEDLEN-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 +; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP4]], align 8 ; FIXEDLEN-NEXT: store <4 x i64> [[BROADCAST_SPLAT]], ptr [[TMP5]], align 8 ; FIXEDLEN-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; FIXEDLEN-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 diff --git a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll index f1518189c41a2..77c41453f4863 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/conversion-cost.ll @@ -139,12 +139,12 @@ define i32 @conversion_cost2(i32 %n, ptr nocapture %A, ptr nocapture %B) nounwin ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds float, ptr [[B]], i64 [[TMP7]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP16]], 
i32 0 -; CHECK-NEXT: store <2 x float> [[TMP12]], ptr [[TMP20]], align 4 ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 2 -; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP21]], align 4 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 4 -; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[TMP22]], align 4 ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i32 6 +; CHECK-NEXT: store <2 x float> [[TMP12]], ptr [[TMP20]], align 4 +; CHECK-NEXT: store <2 x float> [[TMP13]], ptr [[TMP21]], align 4 +; CHECK-NEXT: store <2 x float> [[TMP14]], ptr [[TMP22]], align 4 ; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[TMP23]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <2 x i64> [[STEP_ADD2]], diff --git a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll index 90dae04210071..c1be67853bf7c 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/epilog-vectorization-inductions.ll @@ -174,12 +174,12 @@ define void @test_induction_step_needs_expansion(ptr noalias %j, ptr %k, i64 %l, ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[K]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 0 -; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP15]], align 2 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 16 -; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP16]], align 2 ; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 32 -; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP17]], align 2 ; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[TMP11]], i32 48 +; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP15]], align 2 +; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP16]], align 2 +; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP17]], align 2 ; CHECK-NEXT: store <16 x i16> [[TMP10]], ptr [[TMP18]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <16 x i16> [[STEP_ADD5]], [[DOTSPLAT3]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll index a96cfa7cd99e7..8004563f38165 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/fixed-order-recurrence.ll @@ -30,8 +30,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[WIDE_LOAD1]] = load <16 x i8>, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD1]], <16 x i32> @@ -40,8 
+40,8 @@ define void @firstorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 16 +; CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1 ; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP14]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -129,8 +129,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i32 16 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP5]], align 1 ; CHECK-NEXT: [[WIDE_LOAD5]] = load <16 x i8>, ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i8> [[VECTOR_RECUR]], <16 x i8> [[WIDE_LOAD]], <16 x i32> ; CHECK-NEXT: [[TMP8]] = shufflevector <16 x i8> [[WIDE_LOAD]], <16 x i8> [[WIDE_LOAD5]], <16 x i32> @@ -147,8 +147,8 @@ define void @thirdorderrec(ptr nocapture noundef readonly %x, ptr noalias nocapt ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[Y:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 0 -; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP21]], align 1 ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i32 16 +; CHECK-NEXT: store <16 x i8> [[TMP17]], ptr [[TMP21]], align 1 ; CHECK-NEXT: store <16 x i8> [[TMP18]], ptr [[TMP22]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index baeaef0b67b04..4d28f4ee6280a 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -36,12 +36,12 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 { ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], ptr [[TMP1]], align 4 ; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], ptr [[TMP2]], align 4 ; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 16 -; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], ptr [[TMP3]], align 4 ; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 24 +; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], ptr [[TMP1]], align 4 +; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], ptr [[TMP2]], align 4 
+; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], ptr [[TMP3]], align 4 ; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD3]], ptr [[TMP4]], align 4 ; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <8 x float> [[VEC_IND]], @@ -211,12 +211,12 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[INDEX]] -; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], ptr [[TMP1]], align 8 ; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i64 4 -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], ptr [[TMP2]], align 8 ; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 8 -; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], ptr [[TMP3]], align 8 ; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[TMP1]], i64 12 +; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], ptr [[TMP1]], align 8 +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], ptr [[TMP2]], align 8 +; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], ptr [[TMP3]], align 8 ; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD3]], ptr [[TMP4]], align 8 ; AUTO_VEC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AUTO_VEC-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x double> [[VEC_IND]], @@ -377,12 +377,12 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], ; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 [[INDEX]] -; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 ; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 8 -; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP3]], align 4 ; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 16 -; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP4]], align 4 ; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 24 +; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 +; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP3]], align 4 +; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP4]], align 4 ; AUTO_VEC-NEXT: [[WIDE_LOAD7:%.*]] = load <8 x float>, ptr [[TMP5]], align 4 ; AUTO_VEC-NEXT: [[TMP6:%.*]] = fadd reassoc <8 x float> [[VEC_IND]], [[WIDE_LOAD]] ; AUTO_VEC-NEXT: [[TMP7:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], [[WIDE_LOAD5]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll index 24d9e824f63a1..919f7487eb93e 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll @@ -24,12 +24,12 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; SSE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> ; SSE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> -; SSE-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]] -; SSE-NEXT: [[TMP6:%.*]] = add 
nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]] -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; SSE-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4 -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i64 4 -; SSE-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4 +; SSE-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]] +; SSE-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]] +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 4 +; SSE-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4 +; SSE-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP11]], align 4 ; SSE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; SSE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; SSE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -72,18 +72,18 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) { ; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> ; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> -; AVX1-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]] -; AVX1-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]] -; AVX1-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]] -; AVX1-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]] -; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; AVX1-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP15]], align 4 -; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 4 -; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP16]], align 4 -; AVX1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 8 -; AVX1-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP17]], align 4 -; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 12 -; AVX1-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP18]], align 4 +; AVX1-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]] +; AVX1-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]] +; AVX1-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]] +; AVX1-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]] +; AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 4 +; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 8 +; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 12 +; AVX1-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP19]], align 4 +; AVX1-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP21]], align 4 +; AVX1-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP23]], align 4 +; AVX1-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP25]], align 4 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; AVX1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -126,18 +126,18 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture 
readonly %b) { ; AVX2-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> ; AVX2-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> ; AVX2-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> -; AVX2-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]] -; AVX2-NEXT: [[TMP12:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]] -; AVX2-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]] -; AVX2-NEXT: [[TMP14:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]] -; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; AVX2-NEXT: store <8 x i32> [[TMP11]], ptr [[TMP15]], align 4 -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 8 -; AVX2-NEXT: store <8 x i32> [[TMP12]], ptr [[TMP16]], align 4 -; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 16 -; AVX2-NEXT: store <8 x i32> [[TMP13]], ptr [[TMP17]], align 4 -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 24 -; AVX2-NEXT: store <8 x i32> [[TMP14]], ptr [[TMP18]], align 4 +; AVX2-NEXT: [[TMP15:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]] +; AVX2-NEXT: [[TMP16:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]] +; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]] +; AVX2-NEXT: [[TMP18:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 8 +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 16 +; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 24 +; AVX2-NEXT: store <8 x i32> [[TMP15]], ptr [[TMP19]], align 4 +; AVX2-NEXT: store <8 x i32> [[TMP16]], ptr [[TMP21]], align 4 +; AVX2-NEXT: store <8 x i32> [[TMP17]], ptr [[TMP23]], align 4 +; AVX2-NEXT: store <8 x i32> [[TMP18]], ptr [[TMP25]], align 4 ; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 ; AVX2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; AVX2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll index 5111e78bf8bcd..8527fb433c638 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll @@ -36,23 +36,23 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 32 -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 48 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[NEXT_GEP]], align 2 +; CHECK-NEXT: 
[[WIDE_LOAD9:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2 ; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <16 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD]], <16 x i16> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD9]], <16 x i16> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD10]], <16 x i16> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD11]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[NEXT_GEP5]], align 2 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 16 -; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP10]], align 2 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 32 -; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP11]], align 2 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 48 +; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[NEXT_GEP5]], align 2 +; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP10]], align 2 +; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP11]], align 2 ; CHECK-NEXT: store <16 x i16> [[TMP9]], ptr [[TMP12]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 64 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -167,23 +167,23 @@ define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocaptur ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[INDEX]] ; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 -; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP1]], align 2 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 -; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP2]], align 2 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <32 x i8>, ptr [[NEXT_GEP]], align 2 +; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <32 x i8>, ptr [[TMP1]], align 2 +; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <32 x i8>, ptr [[TMP2]], align 2 ; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <32 x i8>, ptr [[TMP3]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD]], <32 x i8> [[WIDE_LOAD]], <32 x i8> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP5:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD9]], <32 x i8> [[WIDE_LOAD9]], <32 x i8> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP6:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD10]], <32 x i8> [[WIDE_LOAD10]], <32 x i8> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP7:%.*]] = call <32 x i8> @llvm.fshl.v32i8(<32 x i8> [[WIDE_LOAD11]], <32 x i8> [[WIDE_LOAD11]], <32 x i8> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[NEXT_GEP5]], align 2 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 32 -; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP8]], align 2 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 64 -; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP9]], 
align 2 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 96 +; CHECK-NEXT: store <32 x i8> [[TMP4]], ptr [[NEXT_GEP5]], align 2 +; CHECK-NEXT: store <32 x i8> [[TMP5]], ptr [[TMP8]], align 2 +; CHECK-NEXT: store <32 x i8> [[TMP6]], ptr [[TMP9]], align 2 ; CHECK-NEXT: store <32 x i8> [[TMP7]], ptr [[TMP10]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 128 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index 995973a645233..292ab4e4b2c4d 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -36,12 +36,12 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP2]], align 8, !alias.scope !0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 32 -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP3]], align 8, !alias.scope !0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 48 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP2]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP3]], align 8, !alias.scope !0 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i32>, ptr [[TMP4]], align 8, !alias.scope !0 ; CHECK-NEXT: [[TMP5]] = add <16 x i32> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP6]] = add <16 x i32> [[VEC_PHI4]], [[WIDE_LOAD7]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll index 09be042140fd8..19ac52cf0d0b3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/load-deref-pred.ll @@ -47,12 +47,12 @@ define i32 @test_explicit_pred(i64 %len) { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP2]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 4 -; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP8]], i32 8 -; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP8]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP12]], align 4 +; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 +; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP14]], align 4 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr 
[[TMP15]], align 4 ; CHECK-NEXT: [[TMP16:%.*]] = xor <4 x i1> [[TMP4]], ; CHECK-NEXT: [[TMP17:%.*]] = xor <4 x i1> [[TMP5]], @@ -207,12 +207,12 @@ define i32 @test_explicit_pred_generic(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4 ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4 ; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], @@ -848,12 +848,12 @@ define i32 @test_max_trip_count(i64 %len, ptr %test_base, i64 %n) { ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison) ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison) ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison) ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP40]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP48]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP56]], <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP72]], i32 4, <4 x i1> [[TMP64]], <4 x i32> poison) ; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP40]], ; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP48]], @@ -1015,12 +1015,12 @@ define i32 @test_non_zero_start(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4 ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: 
[[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP68]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4 ; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], @@ -1423,12 +1423,12 @@ define i32 @neg_off_by_many(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], @@ -1583,12 +1583,12 @@ define i32 @neg_off_by_one_iteration(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 
x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], @@ -1743,12 +1743,12 @@ define i32 @neg_off_by_one_byte(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], @@ -1912,12 +1912,12 @@ define i32 @test_constant_max(i64 %len, ptr %test_base) { ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP9]] ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[ALLOCA]], i64 [[TMP13]] ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP65]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4 ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP65]], i32 4 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4 ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP65]], i32 8 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4 ; CHECK-NEXT: [[TMP72:%.*]] = getelementptr i32, ptr [[TMP65]], i32 12 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP69]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP70]], align 4 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP71]], align 4 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP72]], align 4 ; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP40]], ; CHECK-NEXT: [[TMP74:%.*]] = xor <4 x i1> [[TMP48]], @@ -2080,12 +2080,12 @@ define i32 @test_allocsize(i64 %len, ptr %test_base) nofree nosync { ; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) ; CHECK-NEXT: 
[[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], @@ -2241,12 +2241,12 @@ define i32 @test_allocsize_array(i64 %len, ptr %test_base) nofree nosync { ; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) ; CHECK-NEXT: [[TMP70:%.*]] = getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], @@ -2412,12 +2412,12 @@ define i32 @test_allocsize_cond_deref(i1 %allzero, ptr %test_base) { ; CHECK-NEXT: [[TMP66:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP8]] ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr i32, ptr [[ALLOCATION]], i64 [[TMP12]] ; CHECK-NEXT: [[TMP68:%.*]] = getelementptr i32, ptr [[TMP64]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) ; CHECK-NEXT: [[TMP69:%.*]] = getelementptr i32, ptr [[TMP64]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) ; CHECK-NEXT: [[TMP70:%.*]] = 
getelementptr i32, ptr [[TMP64]], i32 8 -; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[TMP71:%.*]] = getelementptr i32, ptr [[TMP64]], i32 12 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP68]], i32 4, <4 x i1> [[TMP39]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD4:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP69]], i32 4, <4 x i1> [[TMP47]], <4 x i32> poison) +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP70]], i32 4, <4 x i1> [[TMP55]], <4 x i32> poison) ; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0(ptr [[TMP71]], i32 4, <4 x i1> [[TMP63]], <4 x i32> poison) ; CHECK-NEXT: [[TMP72:%.*]] = xor <4 x i1> [[TMP39]], ; CHECK-NEXT: [[TMP73:%.*]] = xor <4 x i1> [[TMP47]], diff --git a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll index 40ac668c6e599..d775b0e0f0199 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/masked_load_store.ll @@ -1391,24 +1391,24 @@ define void @foo6(ptr nocapture readonly %in, ptr nocapture %out, i32 %size, ptr ; AVX2-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP1]] ; AVX2-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP2]] ; AVX2-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[IN]], i64 [[TMP3]] +; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> poison, <4 x i32> +; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> poison, <4 x i32> +; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP18]], <4 x i1> poison, <4 x i32> +; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP19]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP20]], i32 0 ; AVX2-NEXT: [[TMP25:%.*]] = getelementptr double, ptr [[TMP24]], i32 -3 -; AVX2-NEXT: [[REVERSE12:%.*]] = shufflevector <4 x i1> [[TMP16]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP25]], i32 8, <4 x i1> [[REVERSE12]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE13:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[TMP20]], i32 -4 ; AVX2-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[TMP26]], i32 -3 -; AVX2-NEXT: [[REVERSE14:%.*]] = shufflevector <4 x i1> [[TMP17]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD15:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP27]], i32 8, <4 x i1> [[REVERSE14]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE16:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD15]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[TMP20]], i32 -8 ; AVX2-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[TMP28]], i32 -3 -; AVX2-NEXT: [[REVERSE17:%.*]] = shufflevector <4 x i1> [[TMP18]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD18:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP29]], i32 8, <4 x i1> [[REVERSE17]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE19:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD18]], <4 x double> poison, <4 x i32> ; 
AVX2-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP20]], i32 -12 ; AVX2-NEXT: [[TMP31:%.*]] = getelementptr double, ptr [[TMP30]], i32 -3 -; AVX2-NEXT: [[REVERSE20:%.*]] = shufflevector <4 x i1> [[TMP19]], <4 x i1> poison, <4 x i32> ; AVX2-NEXT: [[WIDE_MASKED_LOAD21:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP31]], i32 8, <4 x i1> [[REVERSE20]], <4 x double> poison), !alias.scope !21 ; AVX2-NEXT: [[REVERSE22:%.*]] = shufflevector <4 x double> [[WIDE_MASKED_LOAD21]], <4 x double> poison, <4 x i32> ; AVX2-NEXT: [[TMP32:%.*]] = fadd <4 x double> [[REVERSE13]], diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll index 3a5db926082f0..4472ae4fecd38 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -30,20 +30,20 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrs ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT12]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP5]], align 8, !alias.scope !0 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP5]], i64 4 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP6]], align 8, !alias.scope !0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP5]], i64 8 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP7]], align 8, !alias.scope !0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP5]], i64 12 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP5]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP6]], align 8, !alias.scope !0 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP7]], align 8, !alias.scope !0 ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP8]], align 8, !alias.scope !0 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT10]], i64 [[INDEX]] -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD]], ptr addrspace(1) [[TMP9]], align 8, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP9]], i64 4 -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD3]], ptr addrspace(1) [[TMP10]], align 8, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP9]], i64 8 -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD4]], ptr addrspace(1) [[TMP11]], align 8, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP9]], i64 12 +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD]], ptr addrspace(1) [[TMP9]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD3]], ptr addrspace(1) [[TMP10]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD4]], ptr addrspace(1) [[TMP11]], align 8, !alias.scope !3, !noalias !0 ; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD5]], 
ptr addrspace(1) [[TMP12]], align 8, !alias.scope !3, !noalias !0 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll index 36dee12e4d7d3..a49090eecac41 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr35432.ll @@ -76,8 +76,8 @@ define i32 @main(ptr %ptr) { ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i32 [[TMP20]] ; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i32 [[TMP21]] ; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 0 -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP24]], align 4 ; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP22]], i32 4 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP24]], align 4 ; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP25]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] @@ -98,7 +98,7 @@ define i32 @main(ptr %ptr) { ; CHECK-NEXT: store i32 0, ptr [[GEP]], align 4 ; CHECK-NEXT: [[CONV5:%.*]] = zext i8 [[DEC]] to i32 ; CHECK-NEXT: [[CMP6:%.*]] = icmp ult i32 [[TMP0]], [[CONV5]] -; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY8]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP6]], label [[FOR_BODY8]], label [[FOR_COND4_FOR_INC9_CRIT_EDGE]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.cond4.for.inc9_crit_edge: ; CHECK-NEXT: [[INC_LCSSA:%.*]] = phi i32 [ [[INC]], [[FOR_BODY8]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: store i32 [[INC_LCSSA]], ptr @a, align 16 diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll index 7ace7fd2c099d..88a58bed39df9 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr47437.ll @@ -135,8 +135,8 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; SSE41-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[D1:%.*]], i64 [[TMP0]] ; SSE41-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP1]] ; SSE41-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 0 -; SSE41-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP28]], align 4 ; SSE41-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP26]], i32 4 +; SSE41-NEXT: store <4 x i32> [[TMP24]], ptr [[TMP28]], align 4 ; SSE41-NEXT: store <4 x i32> [[TMP25]], ptr [[TMP29]], align 4 ; SSE41-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; SSE41-NEXT: [[TMP30:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -271,12 +271,12 @@ define void @test_muladd(ptr noalias nocapture %d1, ptr noalias nocapture readon ; AVX1-NEXT: [[TMP54:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP2]] ; AVX1-NEXT: [[TMP55:%.*]] = getelementptr inbounds i32, ptr [[D1]], i64 [[TMP3]] ; AVX1-NEXT: [[TMP56:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 0 -; AVX1-NEXT: store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4 ; AVX1-NEXT: [[TMP57:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 4 -; AVX1-NEXT: store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4 ; AVX1-NEXT: [[TMP58:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 8 -; AVX1-NEXT: store <4 x i32> [[TMP50]], ptr [[TMP58]], align 4 ; AVX1-NEXT: 
[[TMP59:%.*]] = getelementptr inbounds i32, ptr [[TMP52]], i32 12 +; AVX1-NEXT: store <4 x i32> [[TMP48]], ptr [[TMP56]], align 4 +; AVX1-NEXT: store <4 x i32> [[TMP49]], ptr [[TMP57]], align 4 +; AVX1-NEXT: store <4 x i32> [[TMP50]], ptr [[TMP58]], align 4 ; AVX1-NEXT: store <4 x i32> [[TMP51]], ptr [[TMP59]], align 4 ; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; AVX1-NEXT: [[TMP60:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll index 6c2cb100311eb..804bcb54b4ba5 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -64,8 +64,8 @@ define float @reduction_sum_float_fastmath(i32 %n, ptr %array) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6]] = fadd fast <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP7]] = fadd fast <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]] @@ -88,7 +88,7 @@ define float @reduction_sum_float_fastmath(i32 %n, ptr %array) { ; CHECK-NEXT: [[SUM_INC]] = fadd fast float [[SUM]], [[VALUE]] ; CHECK-NEXT: [[IDX_INC]] = add i32 [[IDX]], 1 ; CHECK-NEXT: [[BE_COND:%.*]] = icmp ne i32 [[IDX_INC]], 4096 -; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[BE_COND]], label [[LOOP]], label [[LOOP_EXIT_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: loop.exit.loopexit: ; CHECK-NEXT: [[SUM_INC_LCSSA:%.*]] = phi float [ [[SUM_INC]], [[LOOP]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[LOOP_EXIT]] @@ -133,8 +133,8 @@ define float @reduction_sum_float_only_reassoc(i32 %n, ptr %array) { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6]] = fadd reassoc <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP7]] = fadd reassoc <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]] @@ -202,8 +202,8 @@ define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, ptr %array) ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[ARRAY:%.*]], i32 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr float, ptr [[ARRAY]], i32 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr float, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[TMP2]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 
; CHECK-NEXT: [[TMP6]] = fadd reassoc contract <4 x float> [[VEC_PHI]], [[WIDE_LOAD]] ; CHECK-NEXT: [[TMP7]] = fadd reassoc contract <4 x float> [[VEC_PHI1]], [[WIDE_LOAD2]] @@ -278,8 +278,8 @@ define float @PR35538(ptr nocapture readonly %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf nsz oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]] @@ -363,8 +363,8 @@ define float @PR35538_more_FMF(ptr nocapture readonly %a, i32 %N) #0 { ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i32 4 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x float>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD]], [[VEC_PHI]] ; CHECK-NEXT: [[TMP7:%.*]] = fcmp nnan ninf oge <4 x float> [[WIDE_LOAD2]], [[VEC_PHI1]] diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll index 23945f9df0afb..04b59475ca4bc 100644 --- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll +++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-const-TC.ll @@ -21,10 +21,10 @@ define dso_local void @constTC(ptr noalias nocapture %A) optsize { ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 -; CHECK-NEXT: store <2 x i32> , ptr [[TMP6]], align 1 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2 -; CHECK-NEXT: store <2 x i32> , ptr [[TMP7]], align 1 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 4 +; CHECK-NEXT: store <2 x i32> , ptr [[TMP6]], align 1 +; CHECK-NEXT: store <2 x i32> , ptr [[TMP7]], align 1 ; CHECK-NEXT: store <2 x i32> , ptr [[TMP8]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 6 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1800 @@ -40,7 +40,7 @@ define dso_local void @constTC(ptr noalias nocapture %A) optsize { ; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 1800 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; diff --git 
a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
index 8ff542a36732d..85243988b7eea 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll
@@ -18,11 +18,13 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) {
 ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.2> = phi ir<33>, vp<[[FOR1_SPLICE:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%gep.ptr> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%for.1.next> = load ir<%gep.ptr>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep.ptr>
+; CHECK-NEXT: WIDEN ir<%for.1.next> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: EMIT vp<[[FOR1_SPLICE]]> = first-order splice ir<%for.1>, ir<%for.1.next>
 ; CHECK-NEXT: EMIT vp<[[FOR2_SPLICE:%.+]]> = first-order splice ir<%for.2>, vp<[[FOR1_SPLICE]]>
 ; CHECK-NEXT: WIDEN ir<%add> = add vp<[[FOR1_SPLICE]]>, vp<[[FOR2_SPLICE]]>
-; CHECK-NEXT: WIDEN store ir<%gep.ptr>, ir<%add>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%gep.ptr>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
@@ -69,13 +71,15 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) {
 ; CHECK-NEXT: FIRST-ORDER-RECURRENCE-PHI ir<%for.3> = phi ir<33>, vp<[[FOR2_SPLICE:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%gep.ptr> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%for.1.next> = load ir<%gep.ptr>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep.ptr>
+; CHECK-NEXT: WIDEN ir<%for.1.next> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: EMIT vp<[[FOR1_SPLICE]]> = first-order splice ir<%for.1>, ir<%for.1.next>
 ; CHECK-NEXT: EMIT vp<[[FOR2_SPLICE]]> = first-order splice ir<%for.2>, vp<[[FOR1_SPLICE]]>
 ; CHECK-NEXT: EMIT vp<[[FOR3_SPLICE:%.+]]> = first-order splice ir<%for.3>, vp<[[FOR2_SPLICE]]>
 ; CHECK-NEXT: WIDEN ir<%add.1> = add vp<[[FOR1_SPLICE]]>, vp<[[FOR2_SPLICE]]>
 ; CHECK-NEXT: WIDEN ir<%add.2> = add ir<%add.1>, vp<[[FOR3_SPLICE]]>
-; CHECK-NEXT: WIDEN store ir<%gep.ptr>, ir<%add.2>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%gep.ptr>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add.2>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VTC]]>
 ; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
index 7b242f565824b..6d6d064252017 100644
--- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
+++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -38,8 +38,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) {
 ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]]
 ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]]
 ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0
-; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4
 ; UNROLL-NO-IC-NEXT:
[[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 4 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP9]], align 4 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i32>, ptr [[TMP10]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD1]], <4 x i32> @@ -48,8 +48,8 @@ define void @recurrence_1(ptr readonly noalias %a, ptr noalias %b, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = add <4 x i32> [[WIDE_LOAD]], [[TMP11]] ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = add <4 x i32> [[WIDE_LOAD1]], [[TMP12]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 0 -; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP17]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP13]], i32 4 +; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP17]], align 4 ; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP18]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -246,8 +246,8 @@ define i32 @recurrence_2(ptr nocapture readonly %a, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 4 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP5]], align 4 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2]] = load <4 x i32>, ptr [[TMP6]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[WIDE_LOAD]], <4 x i32> [[WIDE_LOAD2]], <4 x i32> @@ -507,8 +507,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP3]] ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP4]] ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 0 -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i16, ptr [[TMP5]], i32 4 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP7]], align 2 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i16>, ptr [[TMP8]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <4 x i32> @@ -523,8 +523,8 @@ define void @recurrence_3(ptr readonly noalias %a, ptr noalias %b, i32 %n, float ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP3]] ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds double, ptr [[B]], i64 [[TMP4]] ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i32 0 -; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP17]], ptr [[TMP21]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = getelementptr inbounds double, ptr [[TMP19]], i32 4 +; 
UNROLL-NO-IC-NEXT: store <4 x double> [[TMP17]], ptr [[TMP21]], align 8 ; UNROLL-NO-IC-NEXT: store <4 x double> [[TMP18]], ptr [[TMP22]], align 8 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -1818,8 +1818,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP2]] ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP3]] ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0 -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 4 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i16>, ptr [[TMP7]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <4 x i32> @@ -1832,8 +1832,8 @@ define void @sink_after(ptr noalias %a, ptr noalias %b, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 0 -; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP18]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 4 +; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP18]], align 4 ; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP19]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2041,8 +2041,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds [2 x i16], ptr [[A]], i64 [[TMP6]], i64 1 ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds [2 x i16], ptr [[A]], i64 [[TMP7]], i64 1 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 -; UNROLL-NO-IC-NEXT: store <4 x i32> , ptr [[TMP18]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 4 +; UNROLL-NO-IC-NEXT: store <4 x i32> , ptr [[TMP18]], align 4 ; UNROLL-NO-IC-NEXT: store <4 x i32> , ptr [[TMP19]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = load i16, ptr [[TMP10]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = load i16, ptr [[TMP11]], align 2 @@ -2071,8 +2071,8 @@ define void @PR34711(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP44:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP4]] ; UNROLL-NO-IC-NEXT: [[TMP46:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 0 -; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP42]], ptr [[TMP46]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP47:%.*]] = getelementptr inbounds i32, ptr [[TMP44]], i32 4 +; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP42]], ptr [[TMP46]], align 4 ; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP43]], ptr [[TMP47]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP48:%.*]] = icmp eq 
i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2284,8 +2284,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64 ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP2]] ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 [[TMP3]] ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 0 -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i32 4 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i16>, ptr [[TMP6]], align 2 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD1]] = load <4 x i16>, ptr [[TMP7]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[VECTOR_RECUR]], <4 x i16> [[WIDE_LOAD]], <4 x i32> ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = shufflevector <4 x i16> [[WIDE_LOAD]], <4 x i16> [[WIDE_LOAD1]], <4 x i32> @@ -2300,8 +2300,8 @@ define void @sink_after_with_multiple_users(ptr noalias %a, ptr noalias %b, i64 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 0 -; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP20]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP18]], i32 4 +; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP20]], align 4 ; UNROLL-NO-IC-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP21]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[TMP22:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] @@ -2601,8 +2601,8 @@ define void @sink_dead_inst(ptr %a) { ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr i16, ptr [[A:%.*]], i16 [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr i16, ptr [[A]], i16 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr i16, ptr [[TMP14]], i32 0 -; UNROLL-NO-IC-NEXT: store <4 x i16> [[TMP12]], ptr [[TMP16]], align 2 ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr i16, ptr [[TMP14]], i32 4 +; UNROLL-NO-IC-NEXT: store <4 x i16> [[TMP12]], ptr [[TMP16]], align 2 ; UNROLL-NO-IC-NEXT: store <4 x i16> [[TMP13]], ptr [[TMP17]], align 2 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], @@ -3514,8 +3514,8 @@ define i32 @sink_after_dead_inst(ptr %A.ptr) { ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[A_PTR:%.*]], i16 [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 -; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP12]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP10]], i32 4 +; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP12]], align 4 ; UNROLL-NO-IC-NEXT: store <4 x i32> zeroinitializer, ptr [[TMP13]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <4 x i16> [[STEP_ADD]], diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 44f3c3db26d95..86f90fc75d645 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -102,8 +102,8 @@ define void 
@fp_iv_loop1_fast_FMF(float %init, ptr noalias nocapture %A, i32 %N) ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT5]] ; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 ; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP5]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT5]] @@ -349,8 +349,8 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub reassoc <4 x float> [[VEC_IND]], [[DOTSPLAT5]] ; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 ; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP5]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fsub reassoc <4 x float> [[STEP_ADD]], [[DOTSPLAT5]] @@ -586,8 +586,8 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4 ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP3]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], @@ -859,8 +859,8 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]] ; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4 ; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD11]], ptr [[TMP7]], align 4 ; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT]] ; VEC4_INTERL2-NEXT: [[TMP9:%.*]] = fadd fast <4 x float> [[STEP_ADD11]], [[BROADCAST_SPLAT]] @@ -869,12 +869,12 @@ define void 
@fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[TMP8]] ; VEC4_INTERL2-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[TMP11]], [[TMP9]] ; VEC4_INTERL2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[TMP12]], ptr [[TMP14]], align 4 ; VEC4_INTERL2-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TMP14]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[TMP12]], ptr [[TMP14]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[TMP13]], ptr [[TMP15]], align 4 ; VEC4_INTERL2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[TMP10]], ptr [[TMP16]], align 4 ; VEC4_INTERL2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[TMP10]], ptr [[TMP16]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[TMP11]], ptr [[TMP17]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], @@ -1167,8 +1167,8 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4 ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 4 +; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP3]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; VEC4_INTERL2-NEXT: [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], @@ -1403,8 +1403,8 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float ; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = or i64 [[INDEX]], 4 ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; VEC4_INTERL2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; VEC4_INTERL2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer ; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD3]], zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 2df55bdf89a00..cbae19f19ec16 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -112,8 +112,8 @@ define void @multi_int_induction(ptr %A, i32 %N) { ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4 ; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 2 +; 
UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], @@ -161,8 +161,8 @@ define void @multi_int_induction(ptr %A, i32 %N) { ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP3]] ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP4]] ; UNROLL-NO-IC-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP7]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP5]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP7]], align 4 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP8]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], @@ -205,8 +205,8 @@ define void @multi_int_induction(ptr %A, i32 %N) { ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP3]], align 4 ; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP3]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], @@ -410,13 +410,13 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[TMP5]], i64 [[OFFSET]] -; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 ; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 2 +; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 ; UNROLL-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP7]], align 4, !alias.scope !4, !noalias !7 ; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[OFFSET2]] -; UNROLL-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP9]], align 4, !alias.scope !7 ; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 2 +; UNROLL-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP9]], align 4, !alias.scope !7 ; UNROLL-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP10]], align 4, !alias.scope !7 ; UNROLL-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] ; UNROLL-NEXT: [[TMP12:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD6]] @@ -483,16 +483,16 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP7]] ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP8]] ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = 
getelementptr inbounds float, ptr [[TMP9]], i32 0 -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP11]], align 4, !alias.scope !4, !noalias !7 ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i32 2 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP11]], align 4, !alias.scope !4, !noalias !7 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP12]], align 4, !alias.scope !4, !noalias !7 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = add i64 [[TMP5]], [[OFFSET2]] ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = add i64 [[TMP6]], [[OFFSET2]] ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP13]] ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[TMP14]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 0 -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP17]], align 4, !alias.scope !7 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP15]], i32 2 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP17]], align 4, !alias.scope !7 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP18]], align 4, !alias.scope !7 ; UNROLL-NO-IC-NEXT: [[TMP19:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] ; UNROLL-NO-IC-NEXT: [[TMP20:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD6]] @@ -553,13 +553,13 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[TMP5]], i64 [[OFFSET]] -; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 ; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4 +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !alias.scope !4, !noalias !7 ; INTERLEAVE-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP7]], align 4, !alias.scope !4, !noalias !7 ; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[OFFSET2]] -; INTERLEAVE-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope !7 ; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 4 +; INTERLEAVE-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope !7 ; INTERLEAVE-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !alias.scope !7 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] ; INTERLEAVE-NEXT: [[TMP12:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD6]] @@ -718,8 +718,8 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) { ; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 ; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 2 +; UNROLL-NEXT: 
[[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 ; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8 ; UNROLL-NEXT: [[TMP2]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] ; UNROLL-NEXT: [[TMP3]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]] @@ -766,8 +766,8 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 2 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP4]], align 8 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 ; UNROLL-NO-IC-NEXT: [[TMP6]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] ; UNROLL-NO-IC-NEXT: [[TMP7]] = add <2 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]] @@ -809,8 +809,8 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) { ; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 4 +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8 ; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 ; INTERLEAVE-NEXT: [[TMP2]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]] ; INTERLEAVE-NEXT: [[TMP3]] = add <4 x i64> [[WIDE_LOAD2]], [[VEC_PHI1]] @@ -2117,8 +2117,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) { ; UNROLL-NEXT: [[TMP0:%.*]] = or i32 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 ; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] -; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2 +; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 ; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; UNROLL: pred.udiv.if: @@ -2210,8 +2210,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) { ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 -; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2 +; UNROLL-NO-IC-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP4]], align 4 ; UNROLL-NO-IC-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP5]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[BROADCAST_SPLAT]], i32 0 ; UNROLL-NO-IC-NEXT: br i1 [[TMP6]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] @@ -2305,8 
+2305,8 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) { ; INTERLEAVE-NEXT: [[TMP0:%.*]] = or i32 [[INDEX]], 4 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] -; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 4 +; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] ; INTERLEAVE: pred.udiv.if: @@ -3607,8 +3607,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] ; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], @@ -3683,8 +3683,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; UNROLL-NO-IC-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP12]] ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP13]] ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP16]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP16]], align 4 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP17]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], @@ -3751,8 +3751,8 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] ; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], @@ -3995,8 +3995,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] ; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw 
i32 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], @@ -4074,8 +4074,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; UNROLL-NO-IC-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i8 [[TMP13]] ; UNROLL-NO-IC-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], i8 [[TMP14]] ; UNROLL-NO-IC-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP17]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP17]], align 4 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP18]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], @@ -4145,8 +4145,8 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] ; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], @@ -4293,8 +4293,8 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) { ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], @@ -4334,8 +4334,8 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) { ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP5]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], @@ -4370,8 +4370,8 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) { ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr 
[[TMP1]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], @@ -4511,8 +4511,8 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; UNROLL-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32 ; UNROLL-NEXT: [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32 ; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], @@ -4562,8 +4562,8 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; UNROLL-NO-IC-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP6]] ; UNROLL-NO-IC-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP7]] ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP10]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP10]], align 4 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP11]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], @@ -4603,8 +4603,8 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; INTERLEAVE-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], @@ -4748,8 +4748,8 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]] ; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64 ; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4 ; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], @@ -4795,8 +4795,8 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP2]] ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4 ; 
UNROLL-NO-IC-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP5]], align 4 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP6]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], @@ -4837,8 +4837,8 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]] ; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP2]], align 4 ; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP2]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], @@ -4970,8 +4970,8 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) { ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP0]], align 4 ; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP0]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], @@ -5016,8 +5016,8 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) { ; UNROLL-NO-IC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] ; UNROLL-NO-IC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP1]] ; UNROLL-NO-IC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP4]], align 4 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP5]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], @@ -5057,8 +5057,8 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) { ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], @@ -6050,8 +6050,8 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr % ; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]] ; 
UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP3]] ; UNROLL-NEXT: [[TMP8:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP4]] -; UNROLL-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP6]], align 4 ; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP6]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP6]], align 4 ; UNROLL-NEXT: store <2 x i32> [[TMP8]], ptr [[TMP9]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], @@ -6091,8 +6091,8 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr % ; UNROLL-NO-IC-NEXT: [[TMP10:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP6]] ; UNROLL-NO-IC-NEXT: [[TMP11:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP7]] ; UNROLL-NO-IC-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP8]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP10]], ptr [[TMP12]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP8]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP10]], ptr [[TMP12]], align 4 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP11]], ptr [[TMP13]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 ; UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], @@ -6145,8 +6145,8 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr % ; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]] ; INTERLEAVE-NEXT: [[TMP7:%.*]] = add <4 x i32> [[VEC_IND]], [[TMP3]] ; INTERLEAVE-NEXT: [[TMP8:%.*]] = add <4 x i32> [[STEP_ADD]], [[TMP4]] -; INTERLEAVE-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP6]], align 4 ; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP6]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP6]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], @@ -6396,8 +6396,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NEXT: [[TMP17:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> ; UNROLL-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> ; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: store <2 x i32> [[TMP17]], ptr [[TMP19]], align 4 ; UNROLL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 2 +; UNROLL-NEXT: store <2 x i32> [[TMP17]], ptr [[TMP19]], align 4 ; UNROLL-NEXT: store <2 x i32> [[TMP18]], ptr [[TMP20]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; UNROLL-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[DOTSPLAT3]] @@ -6479,8 +6479,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NO-IC-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[TMP19]] ; UNROLL-NO-IC-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[PTR]], i64 [[TMP20]] ; UNROLL-NO-IC-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 0 -; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP21]], ptr [[TMP25]], align 4 ; UNROLL-NO-IC-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i32 2 +; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP21]], ptr [[TMP25]], align 4 ; UNROLL-NO-IC-NEXT: store <2 x i32> [[TMP22]], ptr [[TMP26]], align 4 ; UNROLL-NO-IC-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; 
UNROLL-NO-IC-NEXT: [[VEC_IND_NEXT]] = add <2 x i32> [[STEP_ADD]], [[DOTSPLAT3]] @@ -6554,8 +6554,8 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> ; INTERLEAVE-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> ; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP19]], align 4 ; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 4 +; INTERLEAVE-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP19]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP20]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 ; INTERLEAVE-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[STEP_ADD]], [[DOTSPLAT3]] diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll index 393490ad862b4..5410d5d2f6471 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -503,27 +503,27 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-NEXT: [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]] -; CHECK-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0 -; CHECK-NEXT: [[DOTSPLAT10:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT9]], <4 x i8> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLATINSERT11:%.*]] = insertelement <4 x i8> poison, i8 [[INDUCTION_IV]], i64 0 -; CHECK-NEXT: [[DOTSPLAT12:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT11]], <4 x i8> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i8> , [[DOTSPLAT12]] -; CHECK-NEXT: [[INDUCTION13:%.*]] = add <4 x i8> [[DOTSPLAT10]], [[TMP8]] +; CHECK-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <4 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0 +; CHECK-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT8]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <4 x i8> poison, i8 [[INDUCTION_IV]], i64 0 +; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT10]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i8> , [[DOTSPLAT11]] +; CHECK-NEXT: [[INDUCTION12:%.*]] = add <4 x i8> [[DOTSPLAT9]], [[TMP8]] ; CHECK-NEXT: [[TMP9:%.*]] = mul i8 [[INDUCTION_IV]], 4 -; CHECK-NEXT: [[DOTSPLATINSERT14:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i64 0 -; CHECK-NEXT: [[DOTSPLAT15:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT14]], <4 x i8> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[DOTSPLATINSERT13:%.*]] = insertelement <4 x i8> poison, i8 [[TMP9]], i64 0 +; CHECK-NEXT: [[DOTSPLAT14:%.*]] = shufflevector <4 x i8> [[DOTSPLATINSERT13]], <4 x i8> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], 
[[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND16:%.*]] = phi <4 x i8> [ [[INDUCTION13]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX18:%.*]] = add i64 1, [[INDEX8]] -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX18]], 0 +; CHECK-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND15:%.*]] = phi <4 x i8> [ [[INDUCTION12]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX17:%.*]] = add i64 1, [[INDEX7]] +; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX17]], 0 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-NEXT: store <4 x i8> [[VEC_IND16]], ptr [[TMP12]], align 1 -; CHECK-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX8]], 4 -; CHECK-NEXT: [[VEC_IND_NEXT17]] = add <4 x i8> [[VEC_IND16]], [[DOTSPLAT15]] -; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT19]], 84 +; CHECK-NEXT: store <4 x i8> [[VEC_IND15]], ptr [[TMP12]], align 1 +; CHECK-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX7]], 4 +; CHECK-NEXT: [[VEC_IND_NEXT16]] = add <4 x i8> [[VEC_IND15]], [[DOTSPLAT14]] +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT18]], 84 ; CHECK-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: vec.epilog.middle.block: ; CHECK-NEXT: br i1 true, label [[OUTER_LATCH]], label [[VEC_EPILOG_SCALAR_PH]] @@ -593,27 +593,27 @@ define void @induction_resume_value_requires_non_trivial_scev_expansion(ptr %dst ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[IND_END]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 84, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[IND_END4:%.*]] = mul i8 84, [[INDUCTION_IV]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT9:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT10:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT9]], <2 x i8> poison, <2 x i32> zeroinitializer -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT11:%.*]] = insertelement <2 x i8> poison, i8 [[INDUCTION_IV]], i64 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT12:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT11]], <2 x i8> poison, <2 x i32> zeroinitializer -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = mul <2 x i8> , [[DOTSPLAT12]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDUCTION13:%.*]] = add <2 x i8> [[DOTSPLAT10]], [[TMP8]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT8:%.*]] = insertelement <2 x i8> poison, i8 [[BC_RESUME_VAL]], i64 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT9:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT8]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <2 x i8> poison, i8 [[INDUCTION_IV]], i64 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT10]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP8:%.*]] = mul <2 x i8> , [[DOTSPLAT11]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDUCTION12:%.*]] = add <2 x i8> [[DOTSPLAT9]], 
[[TMP8]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP9:%.*]] = mul i8 [[INDUCTION_IV]], 2 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT14:%.*]] = insertelement <2 x i8> poison, i8 [[TMP9]], i64 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT15:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT14]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLATINSERT13:%.*]] = insertelement <2 x i8> poison, i8 [[TMP9]], i64 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[DOTSPLAT14:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT13]], <2 x i8> poison, <2 x i32> zeroinitializer ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.vector.body: -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX8:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT19:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND16:%.*]] = phi <2 x i8> [ [[INDUCTION13]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT17:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OFFSET_IDX18:%.*]] = add i64 1, [[INDEX8]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX18]], 0 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX7:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT18:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND15:%.*]] = phi <2 x i8> [ [[INDUCTION12]], [[VEC_EPILOG_PH]] ], [ [[VEC_IND_NEXT16:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[OFFSET_IDX17:%.*]] = add i64 1, [[INDEX7]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP10:%.*]] = add i64 [[OFFSET_IDX17]], 0 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[TMP10]] ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i32 0 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> [[VEC_IND16]], ptr [[TMP12]], align 1 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT19]] = add nuw i64 [[INDEX8]], 2 -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT17]] = add <2 x i8> [[VEC_IND16]], [[DOTSPLAT15]] -; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT19]], 84 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: store <2 x i8> [[VEC_IND15]], ptr [[TMP12]], align 1 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[INDEX_NEXT18]] = add nuw i64 [[INDEX7]], 2 +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[VEC_IND_NEXT16]] = add <2 x i8> [[VEC_IND15]], [[DOTSPLAT14]] +; CHECK-PROFITABLE-BY-DEFAULT-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT18]], 84 ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 [[TMP13]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK-PROFITABLE-BY-DEFAULT: vec.epilog.middle.block: ; CHECK-PROFITABLE-BY-DEFAULT-NEXT: br i1 true, label [[OUTER_LATCH]], label [[VEC_EPILOG_SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll index 44e609439c9bf..87a6b4dce8a3d 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -16,12 +16,12 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; 
CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 8 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 12 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[WIDE_LOAD]]) ; CHECK-NEXT: [[TMP5]] = add i32 [[TMP4]], [[VEC_PHI]] diff --git a/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll b/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll index 1cce3c837c08b..0b98a054ebea3 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-odd-interleave-counts.ll @@ -14,10 +14,10 @@ define i32 @reduction_sum(i64 %n, ptr noalias nocapture %A) { ; UF3-NEXT: [[GEP1:%.+]] = getelementptr inbounds i32, ptr %A, i64 [[IV1]] ; UF3-NEXT: [[GEP2:%.+]] = getelementptr inbounds i32, ptr %A, i64 [[IV2]] ; UF3-NEXT: [[L_GEP0:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 0 -; UF3-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[L_GEP0]], align 4 ; UF3-NEXT: [[L_GEP1:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 4 -; UF3-NEXT: [[L1:%.+]] = load <4 x i32>, ptr [[L_GEP1]], align 4 ; UF3-NEXT: [[L_GEP2:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 8 +; UF3-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[L_GEP0]], align 4 +; UF3-NEXT: [[L1:%.+]] = load <4 x i32>, ptr [[L_GEP1]], align 4 ; UF3-NEXT: [[L2:%.+]] = load <4 x i32>, ptr [[L_GEP2]], align 4 ; UF3-NEXT: [[SUM0_NEXT]] = add <4 x i32> [[SUM0]], [[L0]] ; UF3-NEXT: [[SUM1_NEXT]] = add <4 x i32> [[SUM1]], [[L1]] @@ -50,14 +50,14 @@ define i32 @reduction_sum(i64 %n, ptr noalias nocapture %A) { ; UF5-NEXT: [[GEP3:%.+]] = getelementptr inbounds i32, ptr %A, i64 [[IV3]] ; UF5-NEXT: [[GEP4:%.+]] = getelementptr inbounds i32, ptr %A, i64 [[IV4]] ; UF5-NEXT: [[L_GEP0:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 0 -; UF5-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[L_GEP0]], align 4 ; UF5-NEXT: [[L_GEP1:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 4 -; UF5-NEXT: [[L1:%.+]] = load <4 x i32>, ptr [[L_GEP1]], align 4 ; UF5-NEXT: [[L_GEP2:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 8 -; UF5-NEXT: [[L2:%.+]] = load <4 x i32>, ptr [[L_GEP2]], align 4 ; UF5-NEXT: [[L_GEP3:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 12 -; UF5-NEXT: [[L3:%.+]] = load <4 x i32>, ptr [[L_GEP3]], align 4 ; UF5-NEXT: [[L_GEP4:%.+]] = getelementptr inbounds i32, ptr [[GEP0]], i32 16 +; UF5-NEXT: [[L0:%.+]] = load <4 x i32>, ptr [[L_GEP0]], align 4 +; UF5-NEXT: [[L1:%.+]] = load <4 x i32>, ptr [[L_GEP1]], align 4 +; UF5-NEXT: [[L2:%.+]] = load <4 x i32>, ptr [[L_GEP2]], align 4 +; UF5-NEXT: [[L3:%.+]] = load <4 x i32>, ptr [[L_GEP3]], align 4 ; UF5-NEXT: [[L4:%.+]] = load <4 x i32>, ptr [[L_GEP4]], align 4 ; UF5-NEXT: [[SUM0_NEXT]] = add <4 x i32> [[SUM0]], [[L0]] ; UF5-NEXT: [[SUM1_NEXT]] = add <4 x i32> 
[[SUM1]], [[L1]] diff --git a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll index 034400ca2b0fb..a2c5cd7b75fc4 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-inductions.ll @@ -30,18 +30,18 @@ define void @add_ind64_unrolled(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[TMP7]], i64 [[TMP9]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 8 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP10]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = add nsw [[WIDE_LOAD]], [[VEC_IND]] ; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[WIDE_LOAD2]], [[STEP_ADD]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = shl i64 [[TMP14]], 1 ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i64, ptr [[TMP13]], i64 [[TMP15]] +; CHECK-NEXT: store [[TMP11]], ptr [[TMP13]], align 8 ; CHECK-NEXT: store [[TMP12]], ptr [[TMP16]], align 8 ; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 2 @@ -111,16 +111,16 @@ define void @add_ind64_unrolled_nxv1i64(ptr noalias nocapture %a, ptr noalias no ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[STEP_ADD:%.*]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[TMP6]], i64 [[TMP7]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 ; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load , ptr [[TMP8]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[WIDE_LOAD]], [[VEC_IND]] ; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[WIDE_LOAD2]], [[STEP_ADD]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP9]], ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP11]], i64 [[TMP12]] +; CHECK-NEXT: store [[TMP9]], ptr [[TMP11]], align 8 ; CHECK-NEXT: store [[TMP10]], ptr [[TMP13]], align 8 ; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP15:%.*]] = shl i64 [[TMP14]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll index 989d25c0099c5..72b97566d8e71 100644 --- a/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll +++ b/llvm/test/Transforms/LoopVectorize/scalable-loop-unpredicated-body-scalar-tail.ll @@ -45,18 +45,18 @@ ; CHECKUF2: vector.body: ; CHECKUF2: %index = phi i64 [ 0, 
%vector.ph ], [ %index.next, %vector.body ] ; CHECKUF2: %[[IDXB:.*]] = getelementptr inbounds double, ptr %b, i64 %index -; CHECKUF2: %wide.load = load , ptr %[[IDXB]], align 8 ; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() ; CHECKUF2: %[[VSCALE2:.*]] = shl i64 %[[VSCALE]], 2 ; CHECKUF2: %[[IDXB_NEXT:.*]] = getelementptr inbounds double, ptr %[[IDXB]], i64 %[[VSCALE2]] +; CHECKUF2: %wide.load = load , ptr %[[IDXB]], align 8 ; CHECKUF2: %wide.load{{[0-9]+}} = load , ptr %[[IDXB_NEXT]], align 8 ; CHECKUF2: %[[FADD:.*]] = fadd %wide.load, shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) ; CHECKUF2: %[[FADD_NEXT:.*]] = fadd %wide.load{{[0-9]+}}, shufflevector ( insertelement ( poison, double 1.000000e+00, i64 0), poison, zeroinitializer) ; CHECKUF2: %[[IDXA:.*]] = getelementptr inbounds double, ptr %a, i64 %index -; CHECKUF2: store %[[FADD]], ptr %[[IDXA]], align 8 ; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() ; CHECKUF2: %[[VSCALE2:.*]] = shl i64 %[[VSCALE]], 2 ; CHECKUF2: %[[IDXA_NEXT:.*]] = getelementptr inbounds double, ptr %[[IDXA]], i64 %[[VSCALE2]] +; CHECKUF2: store %[[FADD]], ptr %[[IDXA]], align 8 ; CHECKUF2: store %[[FADD_NEXT]], ptr %[[IDXA_NEXT]], align 8 ; CHECKUF2: %[[VSCALE:.*]] = call i64 @llvm.vscale.i64() ; CHECKUF2: %[[VSCALEX8:.*]] = shl i64 %[[VSCALE]], 3 diff --git a/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll b/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll index f807c50d4e399..b4f5def8756fb 100644 --- a/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll @@ -11,8 +11,8 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; CHECK: %[[T2:.+]] = add nuw nsw i64 %offset.idx, %tmp0 ; CHECK: %[[T3:.+]] = sub nsw i64 %[[T2]], %x ; CHECK: %[[T4:.+]] = getelementptr inbounds i32, ptr %a, i64 %[[T3]] -; CHECK: load <4 x i32>, ptr %[[T4]], align 4 ; CHECK: %[[T6:.+]] = getelementptr inbounds i32, ptr %[[T4]], i64 4 +; CHECK: load <4 x i32>, ptr %[[T4]], align 4 ; CHECK: load <4 x i32>, ptr %[[T6]], align 4 ; CHECK: br {{.*}}, label %middle.block, label %vector.body ; @@ -30,8 +30,8 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; NO-IC: %[[T8:.+]] = getelementptr inbounds i32, ptr %a, i64 %[[T6]] ; NO-IC: %[[T9:.+]] = getelementptr inbounds i32, ptr %a, i64 %[[T7]] ; NO-IC: %[[T10:.+]] = getelementptr inbounds i32, ptr %[[T8]], i32 0 -; NO-IC: load <4 x i32>, ptr %[[T10]], align 4 ; NO-IC: %[[T12:.+]] = getelementptr inbounds i32, ptr %[[T8]], i32 4 +; NO-IC: load <4 x i32>, ptr %[[T10]], align 4 ; NO-IC: load <4 x i32>, ptr %[[T12]], align 4 ; NO-IC: br {{.*}}, label %middle.block, label %vector.body ; diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll index 997c786efbcdc..4bcbfb4ac6299 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll @@ -29,10 +29,12 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw ; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION ir\<0\>, vp\<[[CAN_IV_NEXT:%.+]]\>\l" + ; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<1\>\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\>\l" + -; CHECK-NEXT: " WIDEN ir\<%lv\> = load ir\<%arrayidx\>\l" + +; CHECK-NEXT: " EMIT vp\<[[VEC_PTR:%.+]]\> = 
+; CHECK-NEXT: " EMIT vp\<[[VEC_PTR:%.+]]\> = create-vector-pointer ir\<%arrayidx\>\l" +
+; CHECK-NEXT: " WIDEN ir\<%lv\> = load vp\<[[VEC_PTR]]\>\l" +
 ; CHECK-NEXT: " WIDEN-CALL ir\<%call\> = call @llvm.sqrt.f32(ir\<%lv\>) (using vector intrinsic)\l" +
 ; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\>\l" +
-; CHECK-NEXT: " WIDEN store ir\<%arrayidx2\>, ir\<%call\>\l" +
+; CHECK-NEXT: " EMIT vp\<[[VEC_PTR2:%.+]]\> = create-vector-pointer ir\<%arrayidx2\>\l" +
+; CHECK-NEXT: " WIDEN store vp\<[[VEC_PTR2]]\>, ir\<%call\>\l" +
 ; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT]]\> = VF * UF + nuw vp\<[[CAN_IV]]\>\l" +
 ; CHECK-NEXT: " EMIT branch-on-count vp\<[[CAN_IV_NEXT]]\>, vp\<{{.+}}\>\l" +
 ; CHECK-NEXT: "No successors\l"
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
index 70ede8dc12b9c..865bc05358bed 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-iv-transforms.ll
@@ -16,7 +16,8 @@ define void @iv_no_binary_op_in_descriptor(i1 %c, ptr %dst) {
 ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi 0, %iv.next.p, ir<1>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%gep> = getelementptr inbounds ir<%dst>, vp<[[STEPS:%.+]]>
-; CHECK-NEXT: WIDEN store ir<%gep>, ir<%iv>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%iv>
 ; CHECK-NEXT: EMIT vp<[[CAN_INC:%.+]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_INC]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
index 3626b8c9dce78..d75c3cfe37288 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-printing.ll
@@ -20,10 +20,12 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN-CALL ir<%call> = call @llvm.sqrt.f32(ir<%lv>)
 ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%call>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%call>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -68,12 +70,14 @@ define void @print_widen_gep_and_select(i64 %n, ptr noalias %y, ptr noalias %x,
 ; CHECK-NEXT: WIDEN-INDUCTION %iv = phi %iv.next, 0, ir<1>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: WIDEN-GEP Inv[Var] ir<%arrayidx> = getelementptr inbounds ir<%y>, ir<%iv>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN ir<%cmp> = icmp eq ir<%arrayidx>, ir<%z>
 ; CHECK-NEXT: WIDEN-SELECT ir<%sel> = select ir<%cmp>, ir<1.000000e+01>, ir<2.000000e+01>
 ; CHECK-NEXT: WIDEN ir<%add> = fadd ir<%lv>, ir<%sel>
 ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%arrayidx2>, ir<%add>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -120,7 +124,8 @@ define float @print_reduction(i64 %n, ptr noalias %y) {
 ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>)
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
@@ -166,7 +171,8 @@ define void @print_reduction_with_invariant_store(i64 %n, ptr noalias %y, ptr no
 ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0.000000e+00>, ir<%red.next>
 ; CHECK-NEXT: vp<[[IV:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%y>, vp<[[IV]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%arrayidx>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + fast reduce.fadd (ir<%lv>) (with final reduction value stored in invariant address sank outside of loop)
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
@@ -236,7 +242,8 @@ define void @print_replicate_predicated_phi(i64 %n, ptr %x) {
 ; CHECK-NEXT: EMIT vp<[[NOT:%.+]]> = not ir<%cmp>
 ; CHECK-NEXT: BLEND ir<%d> = ir<0>/vp<[[NOT]]> vp<[[PRED]]>/ir<%cmp>
 ; CHECK-NEXT: CLONE ir<%idx> = getelementptr ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%idx>, ir<%d>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%idx>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%d>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -357,9 +364,11 @@ define float @print_fmuladd_strict(ptr %a, ptr %b, i64 %n) {
 ; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%sum.07> = phi ir<0.000000e+00>, ir<%muladd>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%arrayidx> = getelementptr inbounds ir<%a>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%l.a> = load ir<%arrayidx>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%arrayidx>
+; CHECK-NEXT: WIDEN ir<%l.a> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: CLONE ir<%arrayidx2> = getelementptr inbounds ir<%b>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%l.b> = load ir<%arrayidx2>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%arrayidx2>
+; CHECK-NEXT: WIDEN ir<%l.b> = load vp<[[VEC_PTR2]]>
 ; CHECK-NEXT: EMIT vp<[[FMUL:%.+]]> = fmul nnan ninf nsz ir<%l.a>, ir<%l.b>
 ; CHECK-NEXT: REDUCE ir<[[MULADD:%.+]]> = ir<%sum.07> + nnan ninf nsz reduce.fadd (vp<[[FMUL]]>)
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
@@ -407,7 +416,8 @@ define void @debug_loc_vpinstruction(ptr nocapture %asd, ptr nocapture %bsd) !db
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%isd> = getelementptr inbounds ir<%asd>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lsd> = load ir<%isd>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%isd>
+; CHECK-NEXT: WIDEN ir<%lsd> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN ir<%psd> = add nuw nsw ir<%lsd>, ir<23>
 ; CHECK-NEXT: WIDEN ir<%cmp1> = icmp slt ir<%lsd>, ir<100>
 ; CHECK-NEXT: WIDEN ir<%cmp2> = icmp sge ir<%lsd>, ir<200>
@@ -435,7 +445,8 @@
 ; CHECK-NEXT: EMIT vp<[[NOT2:%.+]]> = not ir<%cmp2>
 ; CHECK-NEXT: EMIT vp<[[SEL2:%.+]]> = select vp<[[NOT1]]>, vp<[[NOT2]]>, ir<false>
 ; CHECK-NEXT: BLEND ir<%ysd.0> = vp<[[PHI]]>/vp<[[OR1]]> ir<%psd>/vp<[[SEL2]]>
-; CHECK-NEXT: WIDEN store ir<%isd>, ir<%ysd.0>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%isd>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%ysd.0>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -550,7 +561,8 @@ define i32 @print_exit_value(ptr %ptr, i32 %off) {
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%gep> = getelementptr inbounds ir<%ptr>, vp<[[STEPS]]>
 ; CHECK-NEXT: WIDEN ir<%add> = add ir<%iv>, ir<%off>
-; CHECK-NEXT: WIDEN store ir<%gep>, ir<0>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<0>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -594,12 +606,14 @@ define void @print_fast_math_flags(i64 %n, ptr noalias %y, ptr noalias %x, ptr %
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%gep.y> = getelementptr inbounds ir<%y>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%gep.y>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep.y>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN ir<%add> = fadd nnan ir<%lv>, ir<1.000000e+00>
 ; CHECK-NEXT: WIDEN ir<%mul> = fmul reassoc nnan ninf nsz arcp contract afn ir<%add>, ir<2.000000e+00>
 ; CHECK-NEXT: WIDEN ir<%div> = fdiv reassoc nsz contract ir<%mul>, ir<2.000000e+00>
 ; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%gep.x>, ir<%div>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep.x>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%div>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -644,11 +658,13 @@ define void @print_exact_flags(i64 %n, ptr noalias %x) {
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%gep.x> = getelementptr inbounds ir<%x>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%lv> = load ir<%gep.x>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep.x>
+; CHECK-NEXT: WIDEN ir<%lv> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN ir<%div.1> = udiv exact ir<%lv>, ir<20>
 ; CHECK-NEXT: WIDEN ir<%div.2> = udiv ir<%lv>, ir<60>
 ; CHECK-NEXT: WIDEN ir<%add> = add nuw nsw ir<%div.1>, ir<%div.2>
-; CHECK-NEXT: WIDEN store ir<%gep.x>, ir<%add>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%gep.x>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%add>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
@@ -692,7 +708,8 @@ define void @print_call_flags(ptr readonly %src, ptr noalias %dest, i64 %n) {
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION ir<0>, vp<[[CAN_IV_NEXT:%.+]]>
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%ld.addr> = getelementptr inbounds ir<%src>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%ld.value> = load ir<%ld.addr>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%ld.addr>
+; CHECK-NEXT: WIDEN ir<%ld.value> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN ir<%ifcond> = fcmp oeq ir<%ld.value>, ir<5.000000e+00>
 ; CHECK-NEXT: Successor(s): pred.call
 ; CHECK-EMPTY:
@@ -718,7 +735,8 @@
 ; CHECK-NEXT: EMIT vp<[[NOT_COND:%.+]]> = not ir<%ifcond>
 ; CHECK-NEXT: BLEND ir<%st.value> = ir<%ld.value>/vp<[[NOT_COND]]> ir<%fadd>/ir<%ifcond>
 ; CHECK-NEXT: CLONE ir<%st.addr> = getelementptr inbounds ir<%dest>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN store ir<%st.addr>, ir<%st.value>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR2:%.+]]> = create-vector-pointer ir<%st.addr>
+; CHECK-NEXT: WIDEN store vp<[[VEC_PTR2]]>, ir<%st.value>
 ; CHECK-NEXT: EMIT vp<[[CAN_IV_NEXT]]> = VF * UF + nuw vp<[[CAN_IV]]>
 ; CHECK-NEXT: EMIT branch-on-count vp<[[CAN_IV_NEXT]]>, vp<[[VEC_TC]]>
 ; CHECK-NEXT: No successors
diff --git a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
index afc46de9a5a16..bddeca00c8ae7 100644
--- a/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
+++ b/llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll
@@ -962,7 +962,8 @@ define void @sinking_requires_duplication(ptr %addr) {
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
 ; CHECK-NEXT: vp<[[STEPS:%.+]]> = SCALAR-STEPS vp<[[CAN_IV]]>, ir<1>
 ; CHECK-NEXT: CLONE ir<%gep> = getelementptr ir<%addr>, vp<[[STEPS]]>
-; CHECK-NEXT: WIDEN ir<%0> = load ir<%gep>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%gep>
+; CHECK-NEXT: WIDEN ir<%0> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN ir<%pred> = fcmp oeq ir<%0>, ir<0.000000e+00>
 ; CHECK-NEXT: EMIT vp<[[MASK:%.+]]> = not ir<%pred>
 ; CHECK-NEXT: Successor(s): pred.store
@@ -1100,7 +1101,8 @@ define void @ptr_induction_remove_dead_recipe(ptr %start, ptr %end) {
 ; CHECK-NEXT: EMIT vp<[[CAN_IV:%.+]]> = CANONICAL-INDUCTION
 ; CHECK-NEXT: EMIT ir<%ptr.iv> = WIDEN-POINTER-INDUCTION ir<%start>, -1
 ; CHECK-NEXT: CLONE ir<%ptr.iv.next> = getelementptr inbounds ir<%ptr.iv>, ir<-1>
-; CHECK-NEXT: WIDEN ir<%l> = load ir<%ptr.iv.next>
+; CHECK-NEXT: EMIT vp<[[VEC_PTR:%.+]]> = create-vector-pointer ir<%ptr.iv.next>
+; CHECK-NEXT: WIDEN ir<%l> = load vp<[[VEC_PTR]]>
 ; CHECK-NEXT: WIDEN ir<%c.1> = icmp eq ir<%l>, ir<0>
 ; CHECK-NEXT: EMIT vp<[[NEG:%.+]]> = not ir<%c.1>
 ; CHECK-NEXT: Successor(s): pred.store